diff --git "a/logging.jsonl" "b/logging.jsonl" new file mode 100644--- /dev/null +++ "b/logging.jsonl" @@ -0,0 +1,6362 @@ +{"lm loss": 7.91819954, "grad_norm": 46.19406891, "learning_rate": 3.1e-07, "elapsed_time_per_iteration": 13.55991244, "memory(GiB)": 20.75, "elapsed_time": "13s", "remaining_time": "23h 57m 34s", "loss_scale": 1.0, "consumed_samples": 256, "global_step/max_steps": "1/6362"} +{"lm loss": 7.92144632, "grad_norm": 46.76684189, "learning_rate": 6.3e-07, "elapsed_time_per_iteration": 6.75834918, "memory(GiB)": 21.42, "elapsed_time": "20s", "remaining_time": "17h 56m 52s", "loss_scale": 1.0, "consumed_samples": 512, "global_step/max_steps": "2/6362"} +{"lm loss": 7.90478945, "grad_norm": 45.96073151, "learning_rate": 9.4e-07, "elapsed_time_per_iteration": 6.58475828, "memory(GiB)": 21.42, "elapsed_time": "26s", "remaining_time": "15h 50m 25s", "loss_scale": 1.0, "consumed_samples": 768, "global_step/max_steps": "3/6362"} +{"lm loss": 7.91293764, "grad_norm": 45.17134094, "learning_rate": 1.26e-06, "elapsed_time_per_iteration": 6.26647544, "memory(GiB)": 21.42, "elapsed_time": "33s", "remaining_time": "14h 38m 42s", "loss_scale": 1.0, "consumed_samples": 1024, "global_step/max_steps": "4/6362"} +{"lm loss": 7.90833092, "grad_norm": 45.6985054, "learning_rate": 1.57e-06, "elapsed_time_per_iteration": 6.62175536, "memory(GiB)": 21.45, "elapsed_time": "39s", "remaining_time": "14h 3m 10s", "loss_scale": 1.0, "consumed_samples": 1280, "global_step/max_steps": "5/6362"} +{"lm loss": 7.87562561, "grad_norm": 42.67409515, "learning_rate": 1.89e-06, "elapsed_time_per_iteration": 6.51397133, "memory(GiB)": 21.45, "elapsed_time": "46s", "remaining_time": "13h 37m 32s", "loss_scale": 1.0, "consumed_samples": 1536, "global_step/max_steps": "6/6362"} +{"lm loss": 7.80979824, "grad_norm": 36.27837753, "learning_rate": 2.2e-06, "elapsed_time_per_iteration": 6.49094772, "memory(GiB)": 21.45, "elapsed_time": "52s", "remaining_time": "13h 18m 51s", "loss_scale": 1.0, "consumed_samples": 1792, "global_step/max_steps": "7/6362"} +{"lm loss": 7.77377033, "grad_norm": 33.75679016, "learning_rate": 2.51e-06, "elapsed_time_per_iteration": 6.56242228, "memory(GiB)": 21.45, "elapsed_time": "59s", "remaining_time": "13h 5m 45s", "loss_scale": 1.0, "consumed_samples": 2048, "global_step/max_steps": "8/6362"} +{"lm loss": 7.6295023, "grad_norm": 21.54918861, "learning_rate": 2.83e-06, "elapsed_time_per_iteration": 6.54825449, "memory(GiB)": 21.45, "elapsed_time": "1m 5s", "remaining_time": "12h 55m 22s", "loss_scale": 1.0, "consumed_samples": 2304, "global_step/max_steps": "9/6362"} +{"lm loss": 7.65249872, "grad_norm": 20.6367321, "learning_rate": 3.14e-06, "elapsed_time_per_iteration": 6.53193903, "memory(GiB)": 21.45, "elapsed_time": "1m 12s", "remaining_time": "12h 46m 53s", "loss_scale": 1.0, "consumed_samples": 2560, "global_step/max_steps": "10/6362"} +{"lm loss": 7.60915995, "grad_norm": 18.86021805, "learning_rate": 3.46e-06, "elapsed_time_per_iteration": 6.66724372, "memory(GiB)": 21.45, "elapsed_time": "1m 19s", "remaining_time": "12h 41m 12s", "loss_scale": 1.0, "consumed_samples": 2816, "global_step/max_steps": "11/6362"} +{"lm loss": 7.44641972, "grad_norm": 11.0698328, "learning_rate": 3.77e-06, "elapsed_time_per_iteration": 6.6772902, "memory(GiB)": 21.45, "elapsed_time": "1m 25s", "remaining_time": "12h 36m 33s", "loss_scale": 1.0, "consumed_samples": 3072, "global_step/max_steps": "12/6362"} +{"lm loss": 7.42586756, "grad_norm": 10.69273949, "learning_rate": 4.09e-06, "elapsed_time_per_iteration": 6.54282045, "memory(GiB)": 21.46, "elapsed_time": "1m 32s", "remaining_time": "12h 31m 30s", "loss_scale": 1.0, "consumed_samples": 3328, "global_step/max_steps": "13/6362"} +{"lm loss": 7.38733482, "grad_norm": 10.03410435, "learning_rate": 4.4e-06, "elapsed_time_per_iteration": 6.6491549, "memory(GiB)": 21.46, "elapsed_time": "1m 38s", "remaining_time": "12h 27m 58s", "loss_scale": 1.0, "consumed_samples": 3584, "global_step/max_steps": "14/6362"} +{"lm loss": 7.39661646, "grad_norm": 9.11944199, "learning_rate": 4.72e-06, "elapsed_time_per_iteration": 6.4675715, "memory(GiB)": 21.46, "elapsed_time": "1m 45s", "remaining_time": "12h 23m 36s", "loss_scale": 1.0, "consumed_samples": 3840, "global_step/max_steps": "15/6362"} +{"lm loss": 7.30855227, "grad_norm": 10.72610664, "learning_rate": 5.03e-06, "elapsed_time_per_iteration": 6.56075573, "memory(GiB)": 21.46, "elapsed_time": "1m 52s", "remaining_time": "12h 20m 23s", "loss_scale": 1.0, "consumed_samples": 4096, "global_step/max_steps": "16/6362"} +{"lm loss": 7.28512096, "grad_norm": 10.53947163, "learning_rate": 5.34e-06, "elapsed_time_per_iteration": 6.41925883, "memory(GiB)": 21.46, "elapsed_time": "1m 58s", "remaining_time": "12h 16m 39s", "loss_scale": 1.0, "consumed_samples": 4352, "global_step/max_steps": "17/6362"} +{"lm loss": 7.24433374, "grad_norm": 9.1178503, "learning_rate": 5.66e-06, "elapsed_time_per_iteration": 6.67588091, "memory(GiB)": 21.46, "elapsed_time": "2m 5s", "remaining_time": "12h 14m 50s", "loss_scale": 1.0, "consumed_samples": 4608, "global_step/max_steps": "18/6362"} +{"lm loss": 7.22753477, "grad_norm": 7.23645973, "learning_rate": 5.97e-06, "elapsed_time_per_iteration": 6.43956685, "memory(GiB)": 21.46, "elapsed_time": "2m 11s", "remaining_time": "12h 11m 53s", "loss_scale": 1.0, "consumed_samples": 4864, "global_step/max_steps": "19/6362"} +{"lm loss": 7.19686747, "grad_norm": 5.9922061, "learning_rate": 6.29e-06, "elapsed_time_per_iteration": 6.47491002, "memory(GiB)": 21.46, "elapsed_time": "2m 18s", "remaining_time": "12h 9m 23s", "loss_scale": 1.0, "consumed_samples": 5120, "global_step/max_steps": "20/6362"} +{"lm loss": 7.15544367, "grad_norm": 5.21185589, "learning_rate": 6.6e-06, "elapsed_time_per_iteration": 6.40039706, "memory(GiB)": 21.46, "elapsed_time": "2m 24s", "remaining_time": "12h 6m 46s", "loss_scale": 1.0, "consumed_samples": 5376, "global_step/max_steps": "21/6362"} +{"lm loss": 7.10304928, "grad_norm": 5.24776459, "learning_rate": 6.92e-06, "elapsed_time_per_iteration": 6.5272131, "memory(GiB)": 21.46, "elapsed_time": "2m 30s", "remaining_time": "12h 4m 58s", "loss_scale": 1.0, "consumed_samples": 5632, "global_step/max_steps": "22/6362"} +{"lm loss": 7.0621891, "grad_norm": 5.66372681, "learning_rate": 7.23e-06, "elapsed_time_per_iteration": 6.34624267, "memory(GiB)": 21.46, "elapsed_time": "2m 37s", "remaining_time": "12h 2m 29s", "loss_scale": 1.0, "consumed_samples": 5888, "global_step/max_steps": "23/6362"} +{"lm loss": 7.00001097, "grad_norm": 5.11387062, "learning_rate": 7.54e-06, "elapsed_time_per_iteration": 6.58272386, "memory(GiB)": 21.46, "elapsed_time": "2m 43s", "remaining_time": "12h 1m 15s", "loss_scale": 1.0, "consumed_samples": 6144, "global_step/max_steps": "24/6362"} +{"lm loss": 6.97879696, "grad_norm": 4.12651014, "learning_rate": 7.86e-06, "elapsed_time_per_iteration": 6.58032751, "memory(GiB)": 21.46, "elapsed_time": "2m 50s", "remaining_time": "12h 0m 5s", "loss_scale": 1.0, "consumed_samples": 6400, "global_step/max_steps": "25/6362"} +{"lm loss": 6.93777752, "grad_norm": 3.6908946, "learning_rate": 8.17e-06, "elapsed_time_per_iteration": 6.55354357, "memory(GiB)": 21.46, "elapsed_time": "2m 57s", "remaining_time": "11h 58m 54s", "loss_scale": 1.0, "consumed_samples": 6656, "global_step/max_steps": "26/6362"} +{"lm loss": 6.928442, "grad_norm": 3.57184196, "learning_rate": 8.49e-06, "elapsed_time_per_iteration": 6.58023238, "memory(GiB)": 21.46, "elapsed_time": "3m 3s", "remaining_time": "11h 57m 54s", "loss_scale": 1.0, "consumed_samples": 6912, "global_step/max_steps": "27/6362"} +{"lm loss": 6.8830061, "grad_norm": 3.47382975, "learning_rate": 8.8e-06, "elapsed_time_per_iteration": 6.52972579, "memory(GiB)": 21.46, "elapsed_time": "3m 10s", "remaining_time": "11h 56m 46s", "loss_scale": 1.0, "consumed_samples": 7168, "global_step/max_steps": "28/6362"} +{"lm loss": 6.8644619, "grad_norm": 2.93933988, "learning_rate": 9.12e-06, "elapsed_time_per_iteration": 6.50750279, "memory(GiB)": 21.46, "elapsed_time": "3m 16s", "remaining_time": "11h 55m 37s", "loss_scale": 1.0, "consumed_samples": 7424, "global_step/max_steps": "29/6362"} +{"lm loss": 6.83335495, "grad_norm": 2.46695518, "learning_rate": 9.43e-06, "elapsed_time_per_iteration": 6.56970572, "memory(GiB)": 21.46, "elapsed_time": "3m 23s", "remaining_time": "11h 54m 46s", "loss_scale": 1.0, "consumed_samples": 7680, "global_step/max_steps": "30/6362"} +{"lm loss": 6.78757238, "grad_norm": 2.4937067, "learning_rate": 9.75e-06, "elapsed_time_per_iteration": 6.7971034, "memory(GiB)": 21.46, "elapsed_time": "3m 29s", "remaining_time": "11h 54m 44s", "loss_scale": 1.0, "consumed_samples": 7936, "global_step/max_steps": "31/6362"} +{"lm loss": 6.77665949, "grad_norm": 2.2796452, "learning_rate": 1.006e-05, "elapsed_time_per_iteration": 6.64160061, "memory(GiB)": 21.46, "elapsed_time": "3m 36s", "remaining_time": "11h 54m 12s", "loss_scale": 1.0, "consumed_samples": 8192, "global_step/max_steps": "32/6362"} +{"lm loss": 6.74401045, "grad_norm": 1.97581911, "learning_rate": 1.037e-05, "elapsed_time_per_iteration": 6.54741383, "memory(GiB)": 21.46, "elapsed_time": "3m 43s", "remaining_time": "11h 53m 22s", "loss_scale": 1.0, "consumed_samples": 8448, "global_step/max_steps": "33/6362"} +{"lm loss": 6.7163105, "grad_norm": 2.00601006, "learning_rate": 1.069e-05, "elapsed_time_per_iteration": 6.44126964, "memory(GiB)": 21.46, "elapsed_time": "3m 49s", "remaining_time": "11h 52m 16s", "loss_scale": 1.0, "consumed_samples": 8704, "global_step/max_steps": "34/6362"} +{"lm loss": 6.69175959, "grad_norm": 1.98527336, "learning_rate": 1.1e-05, "elapsed_time_per_iteration": 6.63253379, "memory(GiB)": 21.46, "elapsed_time": "3m 56s", "remaining_time": "11h 51m 47s", "loss_scale": 1.0, "consumed_samples": 8960, "global_step/max_steps": "35/6362"} +{"lm loss": 6.69707775, "grad_norm": 1.60245991, "learning_rate": 1.132e-05, "elapsed_time_per_iteration": 6.37686419, "memory(GiB)": 21.46, "elapsed_time": "4m 2s", "remaining_time": "11h 50m 35s", "loss_scale": 1.0, "consumed_samples": 9216, "global_step/max_steps": "36/6362"} +{"lm loss": 6.65065575, "grad_norm": 1.63, "learning_rate": 1.163e-05, "elapsed_time_per_iteration": 6.54022956, "memory(GiB)": 21.46, "elapsed_time": "4m 9s", "remaining_time": "11h 49m 54s", "loss_scale": 1.0, "consumed_samples": 9472, "global_step/max_steps": "37/6362"} +{"lm loss": 6.64140892, "grad_norm": 1.51741624, "learning_rate": 1.195e-05, "elapsed_time_per_iteration": 6.37877512, "memory(GiB)": 21.46, "elapsed_time": "4m 15s", "remaining_time": "11h 48m 48s", "loss_scale": 1.0, "consumed_samples": 9728, "global_step/max_steps": "38/6362"} +{"lm loss": 6.62205267, "grad_norm": 1.36325288, "learning_rate": 1.226e-05, "elapsed_time_per_iteration": 6.47770309, "memory(GiB)": 21.46, "elapsed_time": "4m 22s", "remaining_time": "11h 48m 1s", "loss_scale": 1.0, "consumed_samples": 9984, "global_step/max_steps": "39/6362"} +{"lm loss": 6.5893569, "grad_norm": 1.4069041, "learning_rate": 1.257e-05, "elapsed_time_per_iteration": 6.61148047, "memory(GiB)": 21.46, "elapsed_time": "4m 28s", "remaining_time": "11h 47m 37s", "loss_scale": 1.0, "consumed_samples": 10240, "global_step/max_steps": "40/6362"} +{"lm loss": 6.57607269, "grad_norm": 1.32189465, "learning_rate": 1.289e-05, "elapsed_time_per_iteration": 6.46166158, "memory(GiB)": 21.46, "elapsed_time": "4m 35s", "remaining_time": "11h 46m 51s", "loss_scale": 1.0, "consumed_samples": 10496, "global_step/max_steps": "41/6362"} +{"lm loss": 6.55448961, "grad_norm": 1.62171352, "learning_rate": 1.32e-05, "elapsed_time_per_iteration": 6.49326491, "memory(GiB)": 21.46, "elapsed_time": "4m 41s", "remaining_time": "11h 46m 12s", "loss_scale": 1.0, "consumed_samples": 10752, "global_step/max_steps": "42/6362"} +{"lm loss": 6.53595495, "grad_norm": 1.56051207, "learning_rate": 1.352e-05, "elapsed_time_per_iteration": 6.53402519, "memory(GiB)": 21.46, "elapsed_time": "4m 48s", "remaining_time": "11h 45m 40s", "loss_scale": 1.0, "consumed_samples": 11008, "global_step/max_steps": "43/6362"} +{"lm loss": 6.53967857, "grad_norm": 1.15629935, "learning_rate": 1.383e-05, "elapsed_time_per_iteration": 6.53041863, "memory(GiB)": 21.46, "elapsed_time": "4m 54s", "remaining_time": "11h 45m 9s", "loss_scale": 1.0, "consumed_samples": 11264, "global_step/max_steps": "44/6362"} +{"lm loss": 6.51162243, "grad_norm": 1.0911839, "learning_rate": 1.415e-05, "elapsed_time_per_iteration": 6.52400494, "memory(GiB)": 21.46, "elapsed_time": "5m 1s", "remaining_time": "11h 44m 38s", "loss_scale": 1.0, "consumed_samples": 11520, "global_step/max_steps": "45/6362"} +{"lm loss": 6.48286629, "grad_norm": 1.12286747, "learning_rate": 1.446e-05, "elapsed_time_per_iteration": 6.59208965, "memory(GiB)": 21.46, "elapsed_time": "5m 7s", "remaining_time": "11h 44m 18s", "loss_scale": 1.0, "consumed_samples": 11776, "global_step/max_steps": "46/6362"} +{"lm loss": 6.48368454, "grad_norm": 1.5940088, "learning_rate": 1.478e-05, "elapsed_time_per_iteration": 6.47638488, "memory(GiB)": 21.46, "elapsed_time": "5m 14s", "remaining_time": "11h 43m 42s", "loss_scale": 1.0, "consumed_samples": 12032, "global_step/max_steps": "47/6362"} +{"lm loss": 6.44723225, "grad_norm": 1.22692347, "learning_rate": 1.509e-05, "elapsed_time_per_iteration": 6.63181591, "memory(GiB)": 21.46, "elapsed_time": "5m 20s", "remaining_time": "11h 43m 29s", "loss_scale": 1.0, "consumed_samples": 12288, "global_step/max_steps": "48/6362"} +{"lm loss": 6.44457912, "grad_norm": 0.95368278, "learning_rate": 1.54e-05, "elapsed_time_per_iteration": 6.65710616, "memory(GiB)": 21.46, "elapsed_time": "5m 27s", "remaining_time": "11h 43m 18s", "loss_scale": 1.0, "consumed_samples": 12544, "global_step/max_steps": "49/6362"} +{"lm loss": 6.4476738, "grad_norm": 0.97935545, "learning_rate": 1.572e-05, "elapsed_time_per_iteration": 6.46537375, "memory(GiB)": 21.46, "elapsed_time": "5m 34s", "remaining_time": "11h 42m 44s", "loss_scale": 1.0, "consumed_samples": 12800, "global_step/max_steps": "50/6362"} +{"lm loss": 6.40624142, "grad_norm": 1.11043274, "learning_rate": 1.603e-05, "elapsed_time_per_iteration": 6.48869443, "memory(GiB)": 21.46, "elapsed_time": "5m 40s", "remaining_time": "11h 42m 14s", "loss_scale": 1.0, "consumed_samples": 13056, "global_step/max_steps": "51/6362"} +{"lm loss": 6.39625883, "grad_norm": 1.48771572, "learning_rate": 1.635e-05, "elapsed_time_per_iteration": 6.48598051, "memory(GiB)": 21.46, "elapsed_time": "5m 46s", "remaining_time": "11h 41m 44s", "loss_scale": 1.0, "consumed_samples": 13312, "global_step/max_steps": "52/6362"} +{"lm loss": 6.38954449, "grad_norm": 1.13521385, "learning_rate": 1.666e-05, "elapsed_time_per_iteration": 6.57583642, "memory(GiB)": 21.46, "elapsed_time": "5m 53s", "remaining_time": "11h 41m 26s", "loss_scale": 1.0, "consumed_samples": 13568, "global_step/max_steps": "53/6362"} +{"lm loss": 6.37996006, "grad_norm": 1.3415935, "learning_rate": 1.698e-05, "elapsed_time_per_iteration": 6.47658682, "memory(GiB)": 21.46, "elapsed_time": "6m 0s", "remaining_time": "11h 40m 56s", "loss_scale": 1.0, "consumed_samples": 13824, "global_step/max_steps": "54/6362"} +{"lm loss": 6.35142231, "grad_norm": 1.09428155, "learning_rate": 1.729e-05, "elapsed_time_per_iteration": 6.60098195, "memory(GiB)": 21.46, "elapsed_time": "6m 6s", "remaining_time": "11h 40m 42s", "loss_scale": 1.0, "consumed_samples": 14080, "global_step/max_steps": "55/6362"} +{"lm loss": 6.36211252, "grad_norm": 1.05406415, "learning_rate": 1.76e-05, "elapsed_time_per_iteration": 6.52003241, "memory(GiB)": 21.46, "elapsed_time": "6m 13s", "remaining_time": "11h 40m 19s", "loss_scale": 1.0, "consumed_samples": 14336, "global_step/max_steps": "56/6362"} +{"lm loss": 6.33269501, "grad_norm": 1.32021308, "learning_rate": 1.792e-05, "elapsed_time_per_iteration": 6.90489507, "memory(GiB)": 21.46, "elapsed_time": "6m 20s", "remaining_time": "11h 40m 39s", "loss_scale": 1.0, "consumed_samples": 14592, "global_step/max_steps": "57/6362"} +{"lm loss": 6.32445621, "grad_norm": 0.8882668, "learning_rate": 1.823e-05, "elapsed_time_per_iteration": 6.54515076, "memory(GiB)": 21.46, "elapsed_time": "6m 26s", "remaining_time": "11h 40m 19s", "loss_scale": 1.0, "consumed_samples": 14848, "global_step/max_steps": "58/6362"} +{"lm loss": 6.31141329, "grad_norm": 0.97411728, "learning_rate": 1.855e-05, "elapsed_time_per_iteration": 6.61010718, "memory(GiB)": 21.46, "elapsed_time": "6m 33s", "remaining_time": "11h 40m 6s", "loss_scale": 1.0, "consumed_samples": 15104, "global_step/max_steps": "59/6362"} +{"lm loss": 6.30953979, "grad_norm": 1.68780291, "learning_rate": 1.886e-05, "elapsed_time_per_iteration": 6.47443128, "memory(GiB)": 21.46, "elapsed_time": "6m 39s", "remaining_time": "11h 39m 40s", "loss_scale": 1.0, "consumed_samples": 15360, "global_step/max_steps": "60/6362"} +{"lm loss": 6.29541349, "grad_norm": 1.01609051, "learning_rate": 1.918e-05, "elapsed_time_per_iteration": 6.66868734, "memory(GiB)": 21.46, "elapsed_time": "6m 46s", "remaining_time": "11h 39m 34s", "loss_scale": 1.0, "consumed_samples": 15616, "global_step/max_steps": "61/6362"} +{"lm loss": 6.28349781, "grad_norm": 1.40218699, "learning_rate": 1.949e-05, "elapsed_time_per_iteration": 6.53269887, "memory(GiB)": 21.46, "elapsed_time": "6m 52s", "remaining_time": "11h 39m 14s", "loss_scale": 1.0, "consumed_samples": 15872, "global_step/max_steps": "62/6362"} +{"lm loss": 6.28215933, "grad_norm": 0.93296164, "learning_rate": 1.981e-05, "elapsed_time_per_iteration": 6.60805106, "memory(GiB)": 21.46, "elapsed_time": "6m 59s", "remaining_time": "11h 39m 2s", "loss_scale": 1.0, "consumed_samples": 16128, "global_step/max_steps": "63/6362"} +{"lm loss": 6.26364613, "grad_norm": 1.46417248, "learning_rate": 2.012e-05, "elapsed_time_per_iteration": 6.45374918, "memory(GiB)": 21.46, "elapsed_time": "7m 5s", "remaining_time": "11h 38m 35s", "loss_scale": 1.0, "consumed_samples": 16384, "global_step/max_steps": "64/6362"} +{"lm loss": 6.2584796, "grad_norm": 1.12005484, "learning_rate": 2.043e-05, "elapsed_time_per_iteration": 6.56267738, "memory(GiB)": 21.46, "elapsed_time": "7m 12s", "remaining_time": "11h 38m 20s", "loss_scale": 1.0, "consumed_samples": 16640, "global_step/max_steps": "65/6362"} +{"lm loss": 6.23186255, "grad_norm": 1.41627729, "learning_rate": 2.075e-05, "elapsed_time_per_iteration": 6.71550727, "memory(GiB)": 21.46, "elapsed_time": "7m 19s", "remaining_time": "11h 38m 19s", "loss_scale": 1.0, "consumed_samples": 16896, "global_step/max_steps": "66/6362"} +{"lm loss": 6.25570297, "grad_norm": 1.54353654, "learning_rate": 2.106e-05, "elapsed_time_per_iteration": 6.67580724, "memory(GiB)": 21.46, "elapsed_time": "7m 25s", "remaining_time": "11h 38m 14s", "loss_scale": 1.0, "consumed_samples": 17152, "global_step/max_steps": "67/6362"} +{"lm loss": 6.23278713, "grad_norm": 1.18713582, "learning_rate": 2.138e-05, "elapsed_time_per_iteration": 6.68989897, "memory(GiB)": 21.46, "elapsed_time": "7m 32s", "remaining_time": "11h 38m 11s", "loss_scale": 1.0, "consumed_samples": 17408, "global_step/max_steps": "68/6362"} +{"lm loss": 6.23396158, "grad_norm": 1.58858073, "learning_rate": 2.169e-05, "elapsed_time_per_iteration": 6.68483591, "memory(GiB)": 21.46, "elapsed_time": "7m 39s", "remaining_time": "11h 38m 7s", "loss_scale": 1.0, "consumed_samples": 17664, "global_step/max_steps": "69/6362"} +{"lm loss": 6.2111783, "grad_norm": 1.14316428, "learning_rate": 2.201e-05, "elapsed_time_per_iteration": 6.7366786, "memory(GiB)": 21.46, "elapsed_time": "7m 46s", "remaining_time": "11h 38m 7s", "loss_scale": 1.0, "consumed_samples": 17920, "global_step/max_steps": "70/6362"} +{"lm loss": 6.22076178, "grad_norm": 1.38245499, "learning_rate": 2.232e-05, "elapsed_time_per_iteration": 6.46115899, "memory(GiB)": 21.46, "elapsed_time": "7m 52s", "remaining_time": "11h 37m 43s", "loss_scale": 1.0, "consumed_samples": 18176, "global_step/max_steps": "71/6362"} +{"lm loss": 6.19743395, "grad_norm": 1.2576257, "learning_rate": 2.263e-05, "elapsed_time_per_iteration": 6.51851821, "memory(GiB)": 21.46, "elapsed_time": "7m 58s", "remaining_time": "11h 37m 25s", "loss_scale": 1.0, "consumed_samples": 18432, "global_step/max_steps": "72/6362"} +{"lm loss": 6.19747686, "grad_norm": 1.61565483, "learning_rate": 2.295e-05, "elapsed_time_per_iteration": 6.57519436, "memory(GiB)": 21.46, "elapsed_time": "8m 5s", "remaining_time": "11h 37m 12s", "loss_scale": 1.0, "consumed_samples": 18688, "global_step/max_steps": "73/6362"} +{"lm loss": 6.1917944, "grad_norm": 1.09578526, "learning_rate": 2.326e-05, "elapsed_time_per_iteration": 6.48188472, "memory(GiB)": 21.46, "elapsed_time": "8m 12s", "remaining_time": "11h 36m 50s", "loss_scale": 1.0, "consumed_samples": 18944, "global_step/max_steps": "74/6362"} +{"lm loss": 6.20222139, "grad_norm": 1.23850787, "learning_rate": 2.358e-05, "elapsed_time_per_iteration": 6.6156888, "memory(GiB)": 21.46, "elapsed_time": "8m 18s", "remaining_time": "11h 36m 41s", "loss_scale": 1.0, "consumed_samples": 19200, "global_step/max_steps": "75/6362"} +{"lm loss": 6.1716814, "grad_norm": 1.19957805, "learning_rate": 2.389e-05, "elapsed_time_per_iteration": 6.65580392, "memory(GiB)": 21.46, "elapsed_time": "8m 25s", "remaining_time": "11h 36m 35s", "loss_scale": 1.0, "consumed_samples": 19456, "global_step/max_steps": "76/6362"} +{"lm loss": 6.17572832, "grad_norm": 1.7924974, "learning_rate": 2.421e-05, "elapsed_time_per_iteration": 6.64748526, "memory(GiB)": 21.46, "elapsed_time": "8m 31s", "remaining_time": "11h 36m 28s", "loss_scale": 1.0, "consumed_samples": 19712, "global_step/max_steps": "77/6362"} +{"lm loss": 6.18005133, "grad_norm": 1.10561323, "learning_rate": 2.452e-05, "elapsed_time_per_iteration": 6.51205039, "memory(GiB)": 21.46, "elapsed_time": "8m 38s", "remaining_time": "11h 36m 10s", "loss_scale": 1.0, "consumed_samples": 19968, "global_step/max_steps": "78/6362"} +{"lm loss": 6.16462564, "grad_norm": 1.30248475, "learning_rate": 2.483e-05, "elapsed_time_per_iteration": 6.52810955, "memory(GiB)": 21.46, "elapsed_time": "8m 45s", "remaining_time": "11h 35m 54s", "loss_scale": 1.0, "consumed_samples": 20224, "global_step/max_steps": "79/6362"} +{"lm loss": 6.15736198, "grad_norm": 1.26142669, "learning_rate": 2.515e-05, "elapsed_time_per_iteration": 6.98518467, "memory(GiB)": 21.46, "elapsed_time": "8m 51s", "remaining_time": "11h 36m 14s", "loss_scale": 1.0, "consumed_samples": 20480, "global_step/max_steps": "80/6362"} +{"lm loss": 6.16006041, "grad_norm": 1.30265617, "learning_rate": 2.546e-05, "elapsed_time_per_iteration": 6.56417656, "memory(GiB)": 21.46, "elapsed_time": "8m 58s", "remaining_time": "11h 36m 1s", "loss_scale": 1.0, "consumed_samples": 20736, "global_step/max_steps": "81/6362"} +{"lm loss": 6.14295721, "grad_norm": 1.69387579, "learning_rate": 2.578e-05, "elapsed_time_per_iteration": 6.64091754, "memory(GiB)": 21.46, "elapsed_time": "9m 5s", "remaining_time": "11h 35m 54s", "loss_scale": 1.0, "consumed_samples": 20992, "global_step/max_steps": "82/6362"} +{"lm loss": 6.13536549, "grad_norm": 1.44444513, "learning_rate": 2.609e-05, "elapsed_time_per_iteration": 6.52486467, "memory(GiB)": 21.46, "elapsed_time": "9m 11s", "remaining_time": "11h 35m 38s", "loss_scale": 1.0, "consumed_samples": 21248, "global_step/max_steps": "83/6362"} +{"lm loss": 6.11806822, "grad_norm": 1.63234055, "learning_rate": 2.641e-05, "elapsed_time_per_iteration": 6.60365391, "memory(GiB)": 21.46, "elapsed_time": "9m 18s", "remaining_time": "11h 35m 28s", "loss_scale": 1.0, "consumed_samples": 21504, "global_step/max_steps": "84/6362"} +{"lm loss": 6.11807871, "grad_norm": 1.49178922, "learning_rate": 2.672e-05, "elapsed_time_per_iteration": 6.75784254, "memory(GiB)": 21.46, "elapsed_time": "9m 25s", "remaining_time": "11h 35m 29s", "loss_scale": 1.0, "consumed_samples": 21760, "global_step/max_steps": "85/6362"} +{"lm loss": 6.12817955, "grad_norm": 1.66702378, "learning_rate": 2.704e-05, "elapsed_time_per_iteration": 6.43276381, "memory(GiB)": 21.46, "elapsed_time": "9m 31s", "remaining_time": "11h 35m 7s", "loss_scale": 1.0, "consumed_samples": 22016, "global_step/max_steps": "86/6362"} +{"lm loss": 6.12206125, "grad_norm": 1.33530152, "learning_rate": 2.735e-05, "elapsed_time_per_iteration": 6.57642078, "memory(GiB)": 21.46, "elapsed_time": "9m 38s", "remaining_time": "11h 34m 55s", "loss_scale": 1.0, "consumed_samples": 22272, "global_step/max_steps": "87/6362"} +{"lm loss": 6.10201693, "grad_norm": 1.36173844, "learning_rate": 2.766e-05, "elapsed_time_per_iteration": 6.47289038, "memory(GiB)": 21.46, "elapsed_time": "9m 44s", "remaining_time": "11h 34m 37s", "loss_scale": 1.0, "consumed_samples": 22528, "global_step/max_steps": "88/6362"} +{"lm loss": 6.09196186, "grad_norm": 1.90027869, "learning_rate": 2.798e-05, "elapsed_time_per_iteration": 6.80032873, "memory(GiB)": 21.46, "elapsed_time": "9m 51s", "remaining_time": "11h 34m 41s", "loss_scale": 1.0, "consumed_samples": 22784, "global_step/max_steps": "89/6362"} +{"lm loss": 6.1129818, "grad_norm": 1.37496412, "learning_rate": 2.829e-05, "elapsed_time_per_iteration": 6.63593578, "memory(GiB)": 21.46, "elapsed_time": "9m 58s", "remaining_time": "11h 34m 34s", "loss_scale": 1.0, "consumed_samples": 23040, "global_step/max_steps": "90/6362"} +{"lm loss": 6.11775017, "grad_norm": 1.98401165, "learning_rate": 2.861e-05, "elapsed_time_per_iteration": 6.39754272, "memory(GiB)": 21.46, "elapsed_time": "10m 4s", "remaining_time": "11h 34m 10s", "loss_scale": 1.0, "consumed_samples": 23296, "global_step/max_steps": "91/6362"} +{"lm loss": 6.09424448, "grad_norm": 1.13301384, "learning_rate": 2.892e-05, "elapsed_time_per_iteration": 6.36843705, "memory(GiB)": 21.46, "elapsed_time": "10m 10s", "remaining_time": "11h 33m 45s", "loss_scale": 1.0, "consumed_samples": 23552, "global_step/max_steps": "92/6362"} +{"lm loss": 6.09323978, "grad_norm": 1.96932292, "learning_rate": 2.924e-05, "elapsed_time_per_iteration": 6.58081889, "memory(GiB)": 21.46, "elapsed_time": "10m 17s", "remaining_time": "11h 33m 34s", "loss_scale": 1.0, "consumed_samples": 23808, "global_step/max_steps": "93/6362"} +{"lm loss": 6.0847106, "grad_norm": 1.5015322, "learning_rate": 2.955e-05, "elapsed_time_per_iteration": 6.41746926, "memory(GiB)": 21.46, "elapsed_time": "10m 23s", "remaining_time": "11h 33m 13s", "loss_scale": 1.0, "consumed_samples": 24064, "global_step/max_steps": "94/6362"} +{"lm loss": 6.07287645, "grad_norm": 1.6706053, "learning_rate": 2.986e-05, "elapsed_time_per_iteration": 6.60619617, "memory(GiB)": 21.46, "elapsed_time": "10m 30s", "remaining_time": "11h 33m 4s", "loss_scale": 1.0, "consumed_samples": 24320, "global_step/max_steps": "95/6362"} +{"lm loss": 6.07894659, "grad_norm": 1.62379277, "learning_rate": 3.018e-05, "elapsed_time_per_iteration": 6.52940607, "memory(GiB)": 21.46, "elapsed_time": "10m 36s", "remaining_time": "11h 32m 51s", "loss_scale": 1.0, "consumed_samples": 24576, "global_step/max_steps": "96/6362"} +{"lm loss": 6.07883263, "grad_norm": 1.69387245, "learning_rate": 3.049e-05, "elapsed_time_per_iteration": 6.47134948, "memory(GiB)": 21.46, "elapsed_time": "10m 43s", "remaining_time": "11h 32m 34s", "loss_scale": 1.0, "consumed_samples": 24832, "global_step/max_steps": "97/6362"} +{"lm loss": 6.08056068, "grad_norm": 1.99241686, "learning_rate": 3.081e-05, "elapsed_time_per_iteration": 6.59061074, "memory(GiB)": 21.46, "elapsed_time": "10m 49s", "remaining_time": "11h 32m 24s", "loss_scale": 1.0, "consumed_samples": 25088, "global_step/max_steps": "98/6362"} +{"lm loss": 6.04591036, "grad_norm": 1.15356469, "learning_rate": 3.112e-05, "elapsed_time_per_iteration": 6.60288429, "memory(GiB)": 21.46, "elapsed_time": "10m 56s", "remaining_time": "11h 32m 16s", "loss_scale": 1.0, "consumed_samples": 25344, "global_step/max_steps": "99/6362"} +{"lm loss": 6.06128073, "grad_norm": 1.62104976, "learning_rate": 3.144e-05, "elapsed_time_per_iteration": 6.2699132, "memory(GiB)": 21.46, "elapsed_time": "11m 2s", "remaining_time": "11h 31m 46s", "loss_scale": 1.0, "consumed_samples": 25600, "global_step/max_steps": "100/6362"} +{"lm loss": 6.06896973, "grad_norm": 2.38145709, "learning_rate": 3.175e-05, "elapsed_time_per_iteration": 6.43000293, "memory(GiB)": 21.46, "elapsed_time": "11m 9s", "remaining_time": "11h 31m 28s", "loss_scale": 1.0, "consumed_samples": 25856, "global_step/max_steps": "101/6362"} +{"lm loss": 6.03172731, "grad_norm": 1.34509623, "learning_rate": 3.207e-05, "elapsed_time_per_iteration": 6.44194055, "memory(GiB)": 21.46, "elapsed_time": "11m 15s", "remaining_time": "11h 31m 10s", "loss_scale": 1.0, "consumed_samples": 26112, "global_step/max_steps": "102/6362"} +{"lm loss": 6.05245399, "grad_norm": 2.23323703, "learning_rate": 3.238e-05, "elapsed_time_per_iteration": 6.37675285, "memory(GiB)": 21.46, "elapsed_time": "11m 22s", "remaining_time": "11h 30m 48s", "loss_scale": 1.0, "consumed_samples": 26368, "global_step/max_steps": "103/6362"} +{"lm loss": 6.05451155, "grad_norm": 1.91640222, "learning_rate": 3.269e-05, "elapsed_time_per_iteration": 6.56499171, "memory(GiB)": 21.46, "elapsed_time": "11m 28s", "remaining_time": "11h 30m 38s", "loss_scale": 1.0, "consumed_samples": 26624, "global_step/max_steps": "104/6362"} +{"lm loss": 6.02834129, "grad_norm": 1.46306193, "learning_rate": 3.301e-05, "elapsed_time_per_iteration": 6.35174203, "memory(GiB)": 21.46, "elapsed_time": "11m 35s", "remaining_time": "11h 30m 15s", "loss_scale": 1.0, "consumed_samples": 26880, "global_step/max_steps": "105/6362"} +{"lm loss": 6.03012609, "grad_norm": 1.79045057, "learning_rate": 3.332e-05, "elapsed_time_per_iteration": 6.42535877, "memory(GiB)": 21.46, "elapsed_time": "11m 41s", "remaining_time": "11h 29m 57s", "loss_scale": 1.0, "consumed_samples": 27136, "global_step/max_steps": "106/6362"} +{"lm loss": 6.03042078, "grad_norm": 1.75783801, "learning_rate": 3.364e-05, "elapsed_time_per_iteration": 6.47473454, "memory(GiB)": 21.46, "elapsed_time": "11m 47s", "remaining_time": "11h 29m 42s", "loss_scale": 1.0, "consumed_samples": 27392, "global_step/max_steps": "107/6362"} +{"lm loss": 6.05291891, "grad_norm": 1.6484381, "learning_rate": 3.395e-05, "elapsed_time_per_iteration": 6.45607495, "memory(GiB)": 21.46, "elapsed_time": "11m 54s", "remaining_time": "11h 29m 26s", "loss_scale": 1.0, "consumed_samples": 27648, "global_step/max_steps": "108/6362"} +{"lm loss": 6.02772093, "grad_norm": 2.26094079, "learning_rate": 3.427e-05, "elapsed_time_per_iteration": 6.71141481, "memory(GiB)": 21.46, "elapsed_time": "12m 1s", "remaining_time": "11h 29m 25s", "loss_scale": 1.0, "consumed_samples": 27904, "global_step/max_steps": "109/6362"} +{"lm loss": 6.02048397, "grad_norm": 1.60768712, "learning_rate": 3.458e-05, "elapsed_time_per_iteration": 6.71809196, "memory(GiB)": 21.46, "elapsed_time": "12m 7s", "remaining_time": "11h 29m 24s", "loss_scale": 1.0, "consumed_samples": 28160, "global_step/max_steps": "110/6362"} +{"lm loss": 6.01871157, "grad_norm": 1.49303699, "learning_rate": 3.489e-05, "elapsed_time_per_iteration": 6.52514315, "memory(GiB)": 21.46, "elapsed_time": "12m 14s", "remaining_time": "11h 29m 13s", "loss_scale": 1.0, "consumed_samples": 28416, "global_step/max_steps": "111/6362"} +{"lm loss": 5.99633121, "grad_norm": 1.88422394, "learning_rate": 3.521e-05, "elapsed_time_per_iteration": 6.65772462, "memory(GiB)": 21.46, "elapsed_time": "12m 20s", "remaining_time": "11h 29m 8s", "loss_scale": 1.0, "consumed_samples": 28672, "global_step/max_steps": "112/6362"} +{"lm loss": 6.00318289, "grad_norm": 1.34047914, "learning_rate": 3.552e-05, "elapsed_time_per_iteration": 6.53853297, "memory(GiB)": 21.46, "elapsed_time": "12m 27s", "remaining_time": "11h 28m 58s", "loss_scale": 1.0, "consumed_samples": 28928, "global_step/max_steps": "113/6362"} +{"lm loss": 6.01368237, "grad_norm": 2.05009508, "learning_rate": 3.584e-05, "elapsed_time_per_iteration": 6.42418861, "memory(GiB)": 21.46, "elapsed_time": "12m 33s", "remaining_time": "11h 28m 40s", "loss_scale": 1.0, "consumed_samples": 29184, "global_step/max_steps": "114/6362"} +{"lm loss": 5.99993706, "grad_norm": 1.4623512, "learning_rate": 3.615e-05, "elapsed_time_per_iteration": 6.73300076, "memory(GiB)": 21.46, "elapsed_time": "12m 40s", "remaining_time": "11h 28m 40s", "loss_scale": 1.0, "consumed_samples": 29440, "global_step/max_steps": "115/6362"} +{"lm loss": 5.99773407, "grad_norm": 1.71884465, "learning_rate": 3.647e-05, "elapsed_time_per_iteration": 6.63828659, "memory(GiB)": 21.46, "elapsed_time": "12m 47s", "remaining_time": "11h 28m 35s", "loss_scale": 1.0, "consumed_samples": 29696, "global_step/max_steps": "116/6362"} +{"lm loss": 5.99026632, "grad_norm": 1.39715195, "learning_rate": 3.678e-05, "elapsed_time_per_iteration": 6.61137128, "memory(GiB)": 21.46, "elapsed_time": "12m 53s", "remaining_time": "11h 28m 28s", "loss_scale": 1.0, "consumed_samples": 29952, "global_step/max_steps": "117/6362"} +{"lm loss": 5.98576593, "grad_norm": 1.99112499, "learning_rate": 3.71e-05, "elapsed_time_per_iteration": 6.66001987, "memory(GiB)": 21.46, "elapsed_time": "13m 0s", "remaining_time": "11h 28m 24s", "loss_scale": 1.0, "consumed_samples": 30208, "global_step/max_steps": "118/6362"} +{"lm loss": 5.98327303, "grad_norm": 1.6877178, "learning_rate": 3.741e-05, "elapsed_time_per_iteration": 6.45539474, "memory(GiB)": 21.46, "elapsed_time": "13m 7s", "remaining_time": "11h 28m 9s", "loss_scale": 1.0, "consumed_samples": 30464, "global_step/max_steps": "119/6362"} +{"lm loss": 6.00612307, "grad_norm": 2.03146386, "learning_rate": 3.772e-05, "elapsed_time_per_iteration": 6.43478799, "memory(GiB)": 21.46, "elapsed_time": "13m 13s", "remaining_time": "11h 27m 53s", "loss_scale": 1.0, "consumed_samples": 30720, "global_step/max_steps": "120/6362"} +{"lm loss": 5.9782486, "grad_norm": 2.41604662, "learning_rate": 3.804e-05, "elapsed_time_per_iteration": 6.53622651, "memory(GiB)": 21.46, "elapsed_time": "13m 20s", "remaining_time": "11h 27m 43s", "loss_scale": 1.0, "consumed_samples": 30976, "global_step/max_steps": "121/6362"} +{"lm loss": 5.99407768, "grad_norm": 1.64103436, "learning_rate": 3.835e-05, "elapsed_time_per_iteration": 6.70706224, "memory(GiB)": 21.46, "elapsed_time": "13m 26s", "remaining_time": "11h 27m 41s", "loss_scale": 1.0, "consumed_samples": 31232, "global_step/max_steps": "122/6362"} +{"lm loss": 5.96244335, "grad_norm": 1.54052305, "learning_rate": 3.867e-05, "elapsed_time_per_iteration": 6.63435483, "memory(GiB)": 21.46, "elapsed_time": "13m 33s", "remaining_time": "11h 27m 35s", "loss_scale": 1.0, "consumed_samples": 31488, "global_step/max_steps": "123/6362"} +{"lm loss": 5.96701574, "grad_norm": 1.65500581, "learning_rate": 3.898e-05, "elapsed_time_per_iteration": 6.61545277, "memory(GiB)": 21.46, "elapsed_time": "13m 39s", "remaining_time": "11h 27m 29s", "loss_scale": 1.0, "consumed_samples": 31744, "global_step/max_steps": "124/6362"} +{"lm loss": 5.96691656, "grad_norm": 2.13565016, "learning_rate": 3.93e-05, "elapsed_time_per_iteration": 6.60386443, "memory(GiB)": 21.46, "elapsed_time": "13m 46s", "remaining_time": "11h 27m 22s", "loss_scale": 1.0, "consumed_samples": 32000, "global_step/max_steps": "125/6362"} +{"lm loss": 5.94509172, "grad_norm": 1.67036271, "learning_rate": 3.961e-05, "elapsed_time_per_iteration": 6.61689425, "memory(GiB)": 21.46, "elapsed_time": "13m 53s", "remaining_time": "11h 27m 15s", "loss_scale": 1.0, "consumed_samples": 32256, "global_step/max_steps": "126/6362"} +{"lm loss": 5.9523797, "grad_norm": 2.58230042, "learning_rate": 3.992e-05, "elapsed_time_per_iteration": 6.53730583, "memory(GiB)": 21.46, "elapsed_time": "13m 59s", "remaining_time": "11h 27m 5s", "loss_scale": 1.0, "consumed_samples": 32512, "global_step/max_steps": "127/6362"} +{"lm loss": 5.95417881, "grad_norm": 1.37406874, "learning_rate": 4.024e-05, "elapsed_time_per_iteration": 6.52321029, "memory(GiB)": 21.46, "elapsed_time": "14m 6s", "remaining_time": "11h 26m 54s", "loss_scale": 1.0, "consumed_samples": 32768, "global_step/max_steps": "128/6362"} +{"lm loss": 5.97114754, "grad_norm": 3.01327372, "learning_rate": 4.055e-05, "elapsed_time_per_iteration": 6.55342507, "memory(GiB)": 21.46, "elapsed_time": "14m 12s", "remaining_time": "11h 26m 45s", "loss_scale": 1.0, "consumed_samples": 33024, "global_step/max_steps": "129/6362"} +{"lm loss": 5.95517731, "grad_norm": 1.80659676, "learning_rate": 4.087e-05, "elapsed_time_per_iteration": 6.37661314, "memory(GiB)": 21.46, "elapsed_time": "14m 19s", "remaining_time": "11h 26m 27s", "loss_scale": 1.0, "consumed_samples": 33280, "global_step/max_steps": "130/6362"} +{"lm loss": 5.94169235, "grad_norm": 2.55844545, "learning_rate": 4.118e-05, "elapsed_time_per_iteration": 6.57108045, "memory(GiB)": 21.46, "elapsed_time": "14m 25s", "remaining_time": "11h 26m 19s", "loss_scale": 1.0, "consumed_samples": 33536, "global_step/max_steps": "131/6362"} +{"lm loss": 5.95726919, "grad_norm": 2.18657446, "learning_rate": 4.15e-05, "elapsed_time_per_iteration": 6.46464491, "memory(GiB)": 21.46, "elapsed_time": "14m 32s", "remaining_time": "11h 26m 5s", "loss_scale": 1.0, "consumed_samples": 33792, "global_step/max_steps": "132/6362"} +{"lm loss": 5.9333148, "grad_norm": 1.94223964, "learning_rate": 4.181e-05, "elapsed_time_per_iteration": 6.65820003, "memory(GiB)": 21.46, "elapsed_time": "14m 38s", "remaining_time": "11h 26m 1s", "loss_scale": 1.0, "consumed_samples": 34048, "global_step/max_steps": "133/6362"} +{"lm loss": 5.94081974, "grad_norm": 1.62595451, "learning_rate": 4.213e-05, "elapsed_time_per_iteration": 6.3905983, "memory(GiB)": 21.46, "elapsed_time": "14m 45s", "remaining_time": "11h 25m 44s", "loss_scale": 1.0, "consumed_samples": 34304, "global_step/max_steps": "134/6362"} +{"lm loss": 5.94571352, "grad_norm": 1.95931804, "learning_rate": 4.244e-05, "elapsed_time_per_iteration": 6.67690611, "memory(GiB)": 21.46, "elapsed_time": "14m 51s", "remaining_time": "11h 25m 41s", "loss_scale": 1.0, "consumed_samples": 34560, "global_step/max_steps": "135/6362"} +{"lm loss": 5.94105005, "grad_norm": 2.15043831, "learning_rate": 4.275e-05, "elapsed_time_per_iteration": 6.70936131, "memory(GiB)": 21.46, "elapsed_time": "14m 58s", "remaining_time": "11h 25m 39s", "loss_scale": 1.0, "consumed_samples": 34816, "global_step/max_steps": "136/6362"} +{"lm loss": 5.92560148, "grad_norm": 2.49999332, "learning_rate": 4.307e-05, "elapsed_time_per_iteration": 6.61829543, "memory(GiB)": 21.46, "elapsed_time": "15m 5s", "remaining_time": "11h 25m 33s", "loss_scale": 1.0, "consumed_samples": 35072, "global_step/max_steps": "137/6362"} +{"lm loss": 5.92821312, "grad_norm": 2.08556223, "learning_rate": 4.338e-05, "elapsed_time_per_iteration": 6.50004268, "memory(GiB)": 21.46, "elapsed_time": "15m 11s", "remaining_time": "11h 25m 21s", "loss_scale": 1.0, "consumed_samples": 35328, "global_step/max_steps": "138/6362"} +{"lm loss": 5.90252829, "grad_norm": 2.0011797, "learning_rate": 4.37e-05, "elapsed_time_per_iteration": 6.33973217, "memory(GiB)": 21.46, "elapsed_time": "15m 18s", "remaining_time": "11h 25m 3s", "loss_scale": 1.0, "consumed_samples": 35584, "global_step/max_steps": "139/6362"} +{"lm loss": 5.91297007, "grad_norm": 2.07380509, "learning_rate": 4.401e-05, "elapsed_time_per_iteration": 6.48971152, "memory(GiB)": 21.46, "elapsed_time": "15m 24s", "remaining_time": "11h 24m 51s", "loss_scale": 1.0, "consumed_samples": 35840, "global_step/max_steps": "140/6362"} +{"lm loss": 5.90713882, "grad_norm": 1.95418143, "learning_rate": 4.433e-05, "elapsed_time_per_iteration": 6.53335857, "memory(GiB)": 21.46, "elapsed_time": "15m 31s", "remaining_time": "11h 24m 41s", "loss_scale": 1.0, "consumed_samples": 36096, "global_step/max_steps": "141/6362"} +{"lm loss": 5.91098738, "grad_norm": 2.5976367, "learning_rate": 4.464e-05, "elapsed_time_per_iteration": 6.60083866, "memory(GiB)": 21.46, "elapsed_time": "15m 37s", "remaining_time": "11h 24m 35s", "loss_scale": 1.0, "consumed_samples": 36352, "global_step/max_steps": "142/6362"} +{"lm loss": 5.90207481, "grad_norm": 1.62534225, "learning_rate": 4.495e-05, "elapsed_time_per_iteration": 6.48534203, "memory(GiB)": 21.46, "elapsed_time": "15m 44s", "remaining_time": "11h 24m 23s", "loss_scale": 1.0, "consumed_samples": 36608, "global_step/max_steps": "143/6362"} +{"lm loss": 5.90339708, "grad_norm": 2.11716866, "learning_rate": 4.527e-05, "elapsed_time_per_iteration": 6.59004402, "memory(GiB)": 21.46, "elapsed_time": "15m 50s", "remaining_time": "11h 24m 16s", "loss_scale": 1.0, "consumed_samples": 36864, "global_step/max_steps": "144/6362"} +{"lm loss": 5.91547441, "grad_norm": 2.02942753, "learning_rate": 4.558e-05, "elapsed_time_per_iteration": 6.33852005, "memory(GiB)": 21.46, "elapsed_time": "15m 57s", "remaining_time": "11h 23m 58s", "loss_scale": 1.0, "consumed_samples": 37120, "global_step/max_steps": "145/6362"} +{"lm loss": 5.89954138, "grad_norm": 2.09768939, "learning_rate": 4.59e-05, "elapsed_time_per_iteration": 6.45846891, "memory(GiB)": 21.46, "elapsed_time": "16m 3s", "remaining_time": "11h 23m 45s", "loss_scale": 1.0, "consumed_samples": 37376, "global_step/max_steps": "146/6362"} +{"lm loss": 5.89733601, "grad_norm": 2.0106287, "learning_rate": 4.621e-05, "elapsed_time_per_iteration": 6.87703919, "memory(GiB)": 21.46, "elapsed_time": "16m 10s", "remaining_time": "11h 23m 50s", "loss_scale": 1.0, "consumed_samples": 37632, "global_step/max_steps": "147/6362"} +{"lm loss": 5.89778137, "grad_norm": 2.37041903, "learning_rate": 4.653e-05, "elapsed_time_per_iteration": 6.22046185, "memory(GiB)": 21.46, "elapsed_time": "16m 16s", "remaining_time": "11h 23m 28s", "loss_scale": 1.0, "consumed_samples": 37888, "global_step/max_steps": "148/6362"} +{"lm loss": 5.88977718, "grad_norm": 2.03430939, "learning_rate": 4.684e-05, "elapsed_time_per_iteration": 6.42741179, "memory(GiB)": 21.46, "elapsed_time": "16m 23s", "remaining_time": "11h 23m 14s", "loss_scale": 1.0, "consumed_samples": 38144, "global_step/max_steps": "149/6362"} +{"lm loss": 5.89744425, "grad_norm": 1.87363219, "learning_rate": 4.715e-05, "elapsed_time_per_iteration": 6.47205377, "memory(GiB)": 21.46, "elapsed_time": "16m 29s", "remaining_time": "11h 23m 2s", "loss_scale": 1.0, "consumed_samples": 38400, "global_step/max_steps": "150/6362"} +{"lm loss": 5.89311409, "grad_norm": 2.28692269, "learning_rate": 4.747e-05, "elapsed_time_per_iteration": 6.39381075, "memory(GiB)": 21.46, "elapsed_time": "16m 35s", "remaining_time": "11h 22m 47s", "loss_scale": 1.0, "consumed_samples": 38656, "global_step/max_steps": "151/6362"} +{"lm loss": 5.89341545, "grad_norm": 1.55980992, "learning_rate": 4.778e-05, "elapsed_time_per_iteration": 6.39192462, "memory(GiB)": 21.46, "elapsed_time": "16m 42s", "remaining_time": "11h 22m 32s", "loss_scale": 1.0, "consumed_samples": 38912, "global_step/max_steps": "152/6362"} +{"lm loss": 5.89275551, "grad_norm": 2.70719957, "learning_rate": 4.81e-05, "elapsed_time_per_iteration": 6.65648818, "memory(GiB)": 21.46, "elapsed_time": "16m 49s", "remaining_time": "11h 22m 28s", "loss_scale": 1.0, "consumed_samples": 39168, "global_step/max_steps": "153/6362"} +{"lm loss": 5.87317514, "grad_norm": 1.40889037, "learning_rate": 4.841e-05, "elapsed_time_per_iteration": 6.57194376, "memory(GiB)": 21.46, "elapsed_time": "16m 55s", "remaining_time": "11h 22m 20s", "loss_scale": 1.0, "consumed_samples": 39424, "global_step/max_steps": "154/6362"} +{"lm loss": 5.89601183, "grad_norm": 2.48825145, "learning_rate": 4.873e-05, "elapsed_time_per_iteration": 6.56072545, "memory(GiB)": 21.46, "elapsed_time": "17m 2s", "remaining_time": "11h 22m 12s", "loss_scale": 1.0, "consumed_samples": 39680, "global_step/max_steps": "155/6362"} +{"lm loss": 5.88734293, "grad_norm": 1.80317628, "learning_rate": 4.904e-05, "elapsed_time_per_iteration": 6.57311153, "memory(GiB)": 21.46, "elapsed_time": "17m 8s", "remaining_time": "11h 22m 5s", "loss_scale": 1.0, "consumed_samples": 39936, "global_step/max_steps": "156/6362"} +{"lm loss": 5.89002371, "grad_norm": 2.3109405, "learning_rate": 4.936e-05, "elapsed_time_per_iteration": 6.69658518, "memory(GiB)": 21.46, "elapsed_time": "17m 15s", "remaining_time": "11h 22m 2s", "loss_scale": 1.0, "consumed_samples": 40192, "global_step/max_steps": "157/6362"} +{"lm loss": 5.86313152, "grad_norm": 1.97751546, "learning_rate": 4.967e-05, "elapsed_time_per_iteration": 6.60258222, "memory(GiB)": 21.46, "elapsed_time": "17m 22s", "remaining_time": "11h 21m 56s", "loss_scale": 1.0, "consumed_samples": 40448, "global_step/max_steps": "158/6362"} +{"lm loss": 5.87227964, "grad_norm": 2.25152111, "learning_rate": 4.998e-05, "elapsed_time_per_iteration": 6.61240697, "memory(GiB)": 21.46, "elapsed_time": "17m 28s", "remaining_time": "11h 21m 50s", "loss_scale": 1.0, "consumed_samples": 40704, "global_step/max_steps": "159/6362"} +{"lm loss": 5.8675437, "grad_norm": 1.93877769, "learning_rate": 5.03e-05, "elapsed_time_per_iteration": 6.37137485, "memory(GiB)": 21.46, "elapsed_time": "17m 35s", "remaining_time": "11h 21m 35s", "loss_scale": 1.0, "consumed_samples": 40960, "global_step/max_steps": "160/6362"} +{"lm loss": 5.8643589, "grad_norm": 2.39113283, "learning_rate": 5.061e-05, "elapsed_time_per_iteration": 6.47858953, "memory(GiB)": 21.46, "elapsed_time": "17m 41s", "remaining_time": "11h 21m 24s", "loss_scale": 1.0, "consumed_samples": 41216, "global_step/max_steps": "161/6362"} +{"lm loss": 5.88584089, "grad_norm": 2.47935152, "learning_rate": 5.093e-05, "elapsed_time_per_iteration": 6.54274964, "memory(GiB)": 21.46, "elapsed_time": "17m 48s", "remaining_time": "11h 21m 15s", "loss_scale": 1.0, "consumed_samples": 41472, "global_step/max_steps": "162/6362"} +{"lm loss": 5.86156988, "grad_norm": 1.72213054, "learning_rate": 5.124e-05, "elapsed_time_per_iteration": 6.62510777, "memory(GiB)": 21.46, "elapsed_time": "17m 54s", "remaining_time": "11h 21m 10s", "loss_scale": 1.0, "consumed_samples": 41728, "global_step/max_steps": "163/6362"} +{"lm loss": 5.85987997, "grad_norm": 2.92976999, "learning_rate": 5.156e-05, "elapsed_time_per_iteration": 6.52715111, "memory(GiB)": 21.46, "elapsed_time": "18m 1s", "remaining_time": "11h 21m 1s", "loss_scale": 1.0, "consumed_samples": 41984, "global_step/max_steps": "164/6362"} +{"lm loss": 5.85933542, "grad_norm": 1.65587807, "learning_rate": 5.187e-05, "elapsed_time_per_iteration": 6.52031136, "memory(GiB)": 21.46, "elapsed_time": "18m 7s", "remaining_time": "11h 20m 52s", "loss_scale": 1.0, "consumed_samples": 42240, "global_step/max_steps": "165/6362"} +{"lm loss": 5.87544632, "grad_norm": 2.48307037, "learning_rate": 5.218e-05, "elapsed_time_per_iteration": 6.57108045, "memory(GiB)": 21.46, "elapsed_time": "18m 14s", "remaining_time": "11h 20m 44s", "loss_scale": 1.0, "consumed_samples": 42496, "global_step/max_steps": "166/6362"} +{"lm loss": 5.86763048, "grad_norm": 1.78092349, "learning_rate": 5.25e-05, "elapsed_time_per_iteration": 6.56338716, "memory(GiB)": 21.46, "elapsed_time": "18m 20s", "remaining_time": "11h 20m 37s", "loss_scale": 1.0, "consumed_samples": 42752, "global_step/max_steps": "167/6362"} +{"lm loss": 5.86842442, "grad_norm": 2.32510662, "learning_rate": 5.281e-05, "elapsed_time_per_iteration": 6.55288672, "memory(GiB)": 21.46, "elapsed_time": "18m 27s", "remaining_time": "11h 20m 29s", "loss_scale": 1.0, "consumed_samples": 43008, "global_step/max_steps": "168/6362"} +{"lm loss": 5.84864855, "grad_norm": 1.61807823, "learning_rate": 5.313e-05, "elapsed_time_per_iteration": 6.7223556, "memory(GiB)": 21.46, "elapsed_time": "18m 34s", "remaining_time": "11h 20m 27s", "loss_scale": 1.0, "consumed_samples": 43264, "global_step/max_steps": "169/6362"} +{"lm loss": 5.85200644, "grad_norm": 2.45457578, "learning_rate": 5.344e-05, "elapsed_time_per_iteration": 6.62129211, "memory(GiB)": 21.46, "elapsed_time": "18m 40s", "remaining_time": "11h 20m 21s", "loss_scale": 1.0, "consumed_samples": 43520, "global_step/max_steps": "170/6362"} +{"lm loss": 5.84010315, "grad_norm": 1.92252088, "learning_rate": 5.376e-05, "elapsed_time_per_iteration": 6.4010911, "memory(GiB)": 21.46, "elapsed_time": "18m 47s", "remaining_time": "11h 20m 8s", "loss_scale": 1.0, "consumed_samples": 43776, "global_step/max_steps": "171/6362"} +{"lm loss": 5.84977388, "grad_norm": 2.46843767, "learning_rate": 5.407e-05, "elapsed_time_per_iteration": 6.62376428, "memory(GiB)": 21.46, "elapsed_time": "18m 53s", "remaining_time": "11h 20m 2s", "loss_scale": 1.0, "consumed_samples": 44032, "global_step/max_steps": "172/6362"} +{"lm loss": 5.85123444, "grad_norm": 2.30152392, "learning_rate": 5.439e-05, "elapsed_time_per_iteration": 6.49993086, "memory(GiB)": 21.46, "elapsed_time": "19m 0s", "remaining_time": "11h 19m 52s", "loss_scale": 1.0, "consumed_samples": 44288, "global_step/max_steps": "173/6362"} +{"lm loss": 5.82253981, "grad_norm": 2.04706001, "learning_rate": 5.47e-05, "elapsed_time_per_iteration": 6.50711751, "memory(GiB)": 21.46, "elapsed_time": "19m 6s", "remaining_time": "11h 19m 43s", "loss_scale": 1.0, "consumed_samples": 44544, "global_step/max_steps": "174/6362"} +{"lm loss": 5.81592512, "grad_norm": 1.96876097, "learning_rate": 5.501e-05, "elapsed_time_per_iteration": 6.36428285, "memory(GiB)": 21.46, "elapsed_time": "19m 13s", "remaining_time": "11h 19m 28s", "loss_scale": 1.0, "consumed_samples": 44800, "global_step/max_steps": "175/6362"} +{"lm loss": 5.82492828, "grad_norm": 1.92042637, "learning_rate": 5.533e-05, "elapsed_time_per_iteration": 6.53341818, "memory(GiB)": 21.46, "elapsed_time": "19m 19s", "remaining_time": "11h 19m 20s", "loss_scale": 1.0, "consumed_samples": 45056, "global_step/max_steps": "176/6362"} +{"lm loss": 5.83924866, "grad_norm": 1.67214966, "learning_rate": 5.564e-05, "elapsed_time_per_iteration": 6.44736862, "memory(GiB)": 21.46, "elapsed_time": "19m 26s", "remaining_time": "11h 19m 8s", "loss_scale": 1.0, "consumed_samples": 45312, "global_step/max_steps": "177/6362"} +{"lm loss": 5.83348322, "grad_norm": 2.60215187, "learning_rate": 5.596e-05, "elapsed_time_per_iteration": 6.60954618, "memory(GiB)": 21.46, "elapsed_time": "19m 32s", "remaining_time": "11h 19m 2s", "loss_scale": 1.0, "consumed_samples": 45568, "global_step/max_steps": "178/6362"} +{"lm loss": 5.82293272, "grad_norm": 2.00865531, "learning_rate": 5.627e-05, "elapsed_time_per_iteration": 6.45432162, "memory(GiB)": 21.46, "elapsed_time": "19m 39s", "remaining_time": "11h 18m 51s", "loss_scale": 1.0, "consumed_samples": 45824, "global_step/max_steps": "179/6362"} +{"lm loss": 5.81917858, "grad_norm": 2.09116197, "learning_rate": 5.659e-05, "elapsed_time_per_iteration": 6.44038153, "memory(GiB)": 21.46, "elapsed_time": "19m 45s", "remaining_time": "11h 18m 39s", "loss_scale": 1.0, "consumed_samples": 46080, "global_step/max_steps": "180/6362"} +{"lm loss": 5.80894995, "grad_norm": 2.1776495, "learning_rate": 5.69e-05, "elapsed_time_per_iteration": 6.61289549, "memory(GiB)": 21.46, "elapsed_time": "19m 52s", "remaining_time": "11h 18m 34s", "loss_scale": 1.0, "consumed_samples": 46336, "global_step/max_steps": "181/6362"} +{"lm loss": 5.82347488, "grad_norm": 1.97914052, "learning_rate": 5.721e-05, "elapsed_time_per_iteration": 6.38021421, "memory(GiB)": 21.46, "elapsed_time": "19m 58s", "remaining_time": "11h 18m 20s", "loss_scale": 1.0, "consumed_samples": 46592, "global_step/max_steps": "182/6362"} +{"lm loss": 5.8229022, "grad_norm": 3.14197755, "learning_rate": 5.753e-05, "elapsed_time_per_iteration": 6.51340842, "memory(GiB)": 21.46, "elapsed_time": "20m 5s", "remaining_time": "11h 18m 11s", "loss_scale": 1.0, "consumed_samples": 46848, "global_step/max_steps": "183/6362"} +{"lm loss": 5.8103466, "grad_norm": 1.79729784, "learning_rate": 5.784e-05, "elapsed_time_per_iteration": 6.58793068, "memory(GiB)": 21.46, "elapsed_time": "20m 11s", "remaining_time": "11h 18m 5s", "loss_scale": 1.0, "consumed_samples": 47104, "global_step/max_steps": "184/6362"} +{"lm loss": 5.8245101, "grad_norm": 2.67511106, "learning_rate": 5.816e-05, "elapsed_time_per_iteration": 6.56078649, "memory(GiB)": 21.46, "elapsed_time": "20m 18s", "remaining_time": "11h 17m 57s", "loss_scale": 1.0, "consumed_samples": 47360, "global_step/max_steps": "185/6362"} +{"lm loss": 5.80010748, "grad_norm": 1.38302028, "learning_rate": 5.847e-05, "elapsed_time_per_iteration": 6.63167119, "memory(GiB)": 21.46, "elapsed_time": "20m 24s", "remaining_time": "11h 17m 52s", "loss_scale": 1.0, "consumed_samples": 47616, "global_step/max_steps": "186/6362"} +{"lm loss": 5.8157382, "grad_norm": 3.30126858, "learning_rate": 5.879e-05, "elapsed_time_per_iteration": 6.468086, "memory(GiB)": 21.46, "elapsed_time": "20m 31s", "remaining_time": "11h 17m 42s", "loss_scale": 1.0, "consumed_samples": 47872, "global_step/max_steps": "187/6362"} +{"lm loss": 5.81463289, "grad_norm": 2.06785274, "learning_rate": 5.91e-05, "elapsed_time_per_iteration": 6.479913, "memory(GiB)": 21.46, "elapsed_time": "20m 37s", "remaining_time": "11h 17m 32s", "loss_scale": 1.0, "consumed_samples": 48128, "global_step/max_steps": "188/6362"} +{"lm loss": 5.80488777, "grad_norm": 2.41375709, "learning_rate": 5.942e-05, "elapsed_time_per_iteration": 6.64117622, "memory(GiB)": 21.46, "elapsed_time": "20m 44s", "remaining_time": "11h 17m 27s", "loss_scale": 1.0, "consumed_samples": 48384, "global_step/max_steps": "189/6362"} +{"lm loss": 5.82666779, "grad_norm": 2.11778212, "learning_rate": 5.973e-05, "elapsed_time_per_iteration": 6.47750282, "memory(GiB)": 21.46, "elapsed_time": "20m 50s", "remaining_time": "11h 17m 17s", "loss_scale": 1.0, "consumed_samples": 48640, "global_step/max_steps": "190/6362"} +{"lm loss": 5.80697918, "grad_norm": 2.26248074, "learning_rate": 6.004e-05, "elapsed_time_per_iteration": 6.54447365, "memory(GiB)": 21.46, "elapsed_time": "20m 57s", "remaining_time": "11h 17m 9s", "loss_scale": 1.0, "consumed_samples": 48896, "global_step/max_steps": "191/6362"} +{"lm loss": 5.78995466, "grad_norm": 2.24587941, "learning_rate": 6.036e-05, "elapsed_time_per_iteration": 6.53250837, "memory(GiB)": 21.46, "elapsed_time": "21m 4s", "remaining_time": "11h 17m 1s", "loss_scale": 1.0, "consumed_samples": 49152, "global_step/max_steps": "192/6362"} +{"lm loss": 5.80225039, "grad_norm": 2.57044888, "learning_rate": 6.067e-05, "elapsed_time_per_iteration": 6.69442749, "memory(GiB)": 21.46, "elapsed_time": "21m 10s", "remaining_time": "11h 16m 58s", "loss_scale": 1.0, "consumed_samples": 49408, "global_step/max_steps": "193/6362"} +{"lm loss": 5.79426718, "grad_norm": 1.98577249, "learning_rate": 6.099e-05, "elapsed_time_per_iteration": 6.47940779, "memory(GiB)": 21.46, "elapsed_time": "21m 17s", "remaining_time": "11h 16m 48s", "loss_scale": 1.0, "consumed_samples": 49664, "global_step/max_steps": "194/6362"} +{"lm loss": 5.78890562, "grad_norm": 2.34051132, "learning_rate": 6.13e-05, "elapsed_time_per_iteration": 6.35659719, "memory(GiB)": 21.46, "elapsed_time": "21m 23s", "remaining_time": "11h 16m 34s", "loss_scale": 1.0, "consumed_samples": 49920, "global_step/max_steps": "195/6362"} +{"lm loss": 5.80160522, "grad_norm": 2.40052509, "learning_rate": 6.162e-05, "elapsed_time_per_iteration": 6.63660026, "memory(GiB)": 21.46, "elapsed_time": "21m 30s", "remaining_time": "11h 16m 29s", "loss_scale": 1.0, "consumed_samples": 50176, "global_step/max_steps": "196/6362"} +{"lm loss": 5.78919077, "grad_norm": 1.72407007, "learning_rate": 6.193e-05, "elapsed_time_per_iteration": 6.39260292, "memory(GiB)": 21.48, "elapsed_time": "21m 36s", "remaining_time": "11h 16m 17s", "loss_scale": 1.0, "consumed_samples": 50432, "global_step/max_steps": "197/6362"} +{"lm loss": 5.79379272, "grad_norm": 1.7729373, "learning_rate": 6.224e-05, "elapsed_time_per_iteration": 6.42482829, "memory(GiB)": 21.48, "elapsed_time": "21m 43s", "remaining_time": "11h 16m 5s", "loss_scale": 1.0, "consumed_samples": 50688, "global_step/max_steps": "198/6362"} +{"lm loss": 5.78650713, "grad_norm": 2.61923814, "learning_rate": 6.256e-05, "elapsed_time_per_iteration": 6.63386917, "memory(GiB)": 21.48, "elapsed_time": "21m 49s", "remaining_time": "11h 16m 0s", "loss_scale": 1.0, "consumed_samples": 50944, "global_step/max_steps": "199/6362"} +{"lm loss": 5.80067968, "grad_norm": 1.73631012, "learning_rate": 6.287e-05, "elapsed_time_per_iteration": 6.7726202, "memory(GiB)": 21.48, "elapsed_time": "21m 56s", "remaining_time": "11h 15m 59s", "loss_scale": 1.0, "consumed_samples": 51200, "global_step/max_steps": "200/6362"} +{"lm loss": 5.78871202, "grad_norm": 2.27645135, "learning_rate": 6.319e-05, "elapsed_time_per_iteration": 6.63740945, "memory(GiB)": 21.48, "elapsed_time": "22m 3s", "remaining_time": "11h 15m 55s", "loss_scale": 1.0, "consumed_samples": 51456, "global_step/max_steps": "201/6362"} +{"lm loss": 5.79474163, "grad_norm": 2.0291996, "learning_rate": 6.35e-05, "elapsed_time_per_iteration": 6.68464065, "memory(GiB)": 21.48, "elapsed_time": "22m 9s", "remaining_time": "11h 15m 51s", "loss_scale": 1.0, "consumed_samples": 51712, "global_step/max_steps": "202/6362"} +{"lm loss": 5.78001118, "grad_norm": 2.5980823, "learning_rate": 6.382e-05, "elapsed_time_per_iteration": 6.39135957, "memory(GiB)": 21.48, "elapsed_time": "22m 16s", "remaining_time": "11h 15m 39s", "loss_scale": 1.0, "consumed_samples": 51968, "global_step/max_steps": "203/6362"} +{"lm loss": 5.78634357, "grad_norm": 2.07940316, "learning_rate": 6.413e-05, "elapsed_time_per_iteration": 6.63736582, "memory(GiB)": 21.48, "elapsed_time": "22m 22s", "remaining_time": "11h 15m 34s", "loss_scale": 1.0, "consumed_samples": 52224, "global_step/max_steps": "204/6362"} +{"lm loss": 5.77483559, "grad_norm": 2.26634836, "learning_rate": 6.445e-05, "elapsed_time_per_iteration": 6.62155485, "memory(GiB)": 21.48, "elapsed_time": "22m 29s", "remaining_time": "11h 15m 28s", "loss_scale": 1.0, "consumed_samples": 52480, "global_step/max_steps": "205/6362"} +{"lm loss": 5.79479313, "grad_norm": 1.84170139, "learning_rate": 6.476e-05, "elapsed_time_per_iteration": 6.31524396, "memory(GiB)": 21.48, "elapsed_time": "22m 35s", "remaining_time": "11h 15m 14s", "loss_scale": 1.0, "consumed_samples": 52736, "global_step/max_steps": "206/6362"} +{"lm loss": 5.80010748, "grad_norm": 2.56543636, "learning_rate": 6.507e-05, "elapsed_time_per_iteration": 6.49868488, "memory(GiB)": 21.48, "elapsed_time": "22m 42s", "remaining_time": "11h 15m 5s", "loss_scale": 1.0, "consumed_samples": 52992, "global_step/max_steps": "207/6362"} +{"lm loss": 5.75271654, "grad_norm": 2.11600685, "learning_rate": 6.539e-05, "elapsed_time_per_iteration": 6.35655761, "memory(GiB)": 21.48, "elapsed_time": "22m 48s", "remaining_time": "11h 14m 52s", "loss_scale": 1.0, "consumed_samples": 53248, "global_step/max_steps": "208/6362"} +{"lm loss": 5.76977825, "grad_norm": 2.39720702, "learning_rate": 6.57e-05, "elapsed_time_per_iteration": 6.48629022, "memory(GiB)": 21.48, "elapsed_time": "22m 55s", "remaining_time": "11h 14m 42s", "loss_scale": 1.0, "consumed_samples": 53504, "global_step/max_steps": "209/6362"} +{"lm loss": 5.78668165, "grad_norm": 1.98452806, "learning_rate": 6.602e-05, "elapsed_time_per_iteration": 6.4516449, "memory(GiB)": 21.48, "elapsed_time": "23m 1s", "remaining_time": "11h 14m 32s", "loss_scale": 1.0, "consumed_samples": 53760, "global_step/max_steps": "210/6362"} +{"lm loss": 5.76176977, "grad_norm": 3.01932788, "learning_rate": 6.633e-05, "elapsed_time_per_iteration": 6.65230536, "memory(GiB)": 21.48, "elapsed_time": "23m 8s", "remaining_time": "11h 14m 27s", "loss_scale": 1.0, "consumed_samples": 54016, "global_step/max_steps": "211/6362"} +{"lm loss": 5.76241112, "grad_norm": 1.89934301, "learning_rate": 6.665e-05, "elapsed_time_per_iteration": 6.47007155, "memory(GiB)": 21.48, "elapsed_time": "23m 14s", "remaining_time": "11h 14m 18s", "loss_scale": 1.0, "consumed_samples": 54272, "global_step/max_steps": "212/6362"} +{"lm loss": 5.76248217, "grad_norm": 1.91309297, "learning_rate": 6.696e-05, "elapsed_time_per_iteration": 6.27693009, "memory(GiB)": 21.48, "elapsed_time": "23m 20s", "remaining_time": "11h 14m 2s", "loss_scale": 1.0, "consumed_samples": 54528, "global_step/max_steps": "213/6362"} +{"lm loss": 5.75856543, "grad_norm": 2.44234729, "learning_rate": 6.727e-05, "elapsed_time_per_iteration": 6.38477659, "memory(GiB)": 21.48, "elapsed_time": "23m 27s", "remaining_time": "11h 13m 50s", "loss_scale": 1.0, "consumed_samples": 54784, "global_step/max_steps": "214/6362"} +{"lm loss": 5.74364376, "grad_norm": 1.79091311, "learning_rate": 6.759e-05, "elapsed_time_per_iteration": 6.53455663, "memory(GiB)": 21.48, "elapsed_time": "23m 33s", "remaining_time": "11h 13m 43s", "loss_scale": 1.0, "consumed_samples": 55040, "global_step/max_steps": "215/6362"} +{"lm loss": 5.76804113, "grad_norm": 2.69768381, "learning_rate": 6.79e-05, "elapsed_time_per_iteration": 6.71147084, "memory(GiB)": 21.48, "elapsed_time": "23m 40s", "remaining_time": "11h 13m 40s", "loss_scale": 1.0, "consumed_samples": 55296, "global_step/max_steps": "216/6362"} +{"lm loss": 5.76590586, "grad_norm": 1.79428434, "learning_rate": 6.822e-05, "elapsed_time_per_iteration": 6.58169007, "memory(GiB)": 21.48, "elapsed_time": "23m 47s", "remaining_time": "11h 13m 33s", "loss_scale": 1.0, "consumed_samples": 55552, "global_step/max_steps": "217/6362"} +{"lm loss": 5.74884176, "grad_norm": 2.16591358, "learning_rate": 6.853e-05, "elapsed_time_per_iteration": 6.47490788, "memory(GiB)": 21.48, "elapsed_time": "23m 53s", "remaining_time": "11h 13m 24s", "loss_scale": 1.0, "consumed_samples": 55808, "global_step/max_steps": "218/6362"} +{"lm loss": 5.76341915, "grad_norm": 2.22769451, "learning_rate": 6.885e-05, "elapsed_time_per_iteration": 6.52013469, "memory(GiB)": 21.48, "elapsed_time": "24m 0s", "remaining_time": "11h 13m 16s", "loss_scale": 1.0, "consumed_samples": 56064, "global_step/max_steps": "219/6362"} +{"lm loss": 5.7251358, "grad_norm": 2.30821109, "learning_rate": 6.916e-05, "elapsed_time_per_iteration": 6.71687245, "memory(GiB)": 21.48, "elapsed_time": "24m 6s", "remaining_time": "11h 13m 13s", "loss_scale": 1.0, "consumed_samples": 56320, "global_step/max_steps": "220/6362"} +{"lm loss": 5.73849869, "grad_norm": 2.33240461, "learning_rate": 6.948e-05, "elapsed_time_per_iteration": 6.46020651, "memory(GiB)": 21.48, "elapsed_time": "24m 13s", "remaining_time": "11h 13m 3s", "loss_scale": 1.0, "consumed_samples": 56576, "global_step/max_steps": "221/6362"} +{"lm loss": 5.73505783, "grad_norm": 2.25819945, "learning_rate": 6.979e-05, "elapsed_time_per_iteration": 6.52227497, "memory(GiB)": 21.48, "elapsed_time": "24m 19s", "remaining_time": "11h 12m 55s", "loss_scale": 1.0, "consumed_samples": 56832, "global_step/max_steps": "222/6362"} +{"lm loss": 5.74613571, "grad_norm": 2.09427023, "learning_rate": 7.01e-05, "elapsed_time_per_iteration": 6.40544701, "memory(GiB)": 21.48, "elapsed_time": "24m 26s", "remaining_time": "11h 12m 44s", "loss_scale": 1.0, "consumed_samples": 57088, "global_step/max_steps": "223/6362"} +{"lm loss": 5.73427486, "grad_norm": 2.33301091, "learning_rate": 7.042e-05, "elapsed_time_per_iteration": 6.51290536, "memory(GiB)": 21.48, "elapsed_time": "24m 32s", "remaining_time": "11h 12m 36s", "loss_scale": 1.0, "consumed_samples": 57344, "global_step/max_steps": "224/6362"} +{"lm loss": 5.74156666, "grad_norm": 2.09488487, "learning_rate": 7.073e-05, "elapsed_time_per_iteration": 6.58829498, "memory(GiB)": 21.48, "elapsed_time": "24m 39s", "remaining_time": "11h 12m 30s", "loss_scale": 1.0, "consumed_samples": 57600, "global_step/max_steps": "225/6362"} +{"lm loss": 5.73589468, "grad_norm": 2.48346019, "learning_rate": 7.105e-05, "elapsed_time_per_iteration": 6.55364537, "memory(GiB)": 21.48, "elapsed_time": "24m 45s", "remaining_time": "11h 12m 22s", "loss_scale": 1.0, "consumed_samples": 57856, "global_step/max_steps": "226/6362"} +{"lm loss": 5.72919464, "grad_norm": 2.01421833, "learning_rate": 7.136e-05, "elapsed_time_per_iteration": 6.39198518, "memory(GiB)": 21.48, "elapsed_time": "24m 52s", "remaining_time": "11h 12m 11s", "loss_scale": 1.0, "consumed_samples": 58112, "global_step/max_steps": "227/6362"} +{"lm loss": 5.72386885, "grad_norm": 2.53450108, "learning_rate": 7.168e-05, "elapsed_time_per_iteration": 6.37529063, "memory(GiB)": 21.48, "elapsed_time": "24m 58s", "remaining_time": "11h 11m 59s", "loss_scale": 1.0, "consumed_samples": 58368, "global_step/max_steps": "228/6362"} +{"lm loss": 5.74181461, "grad_norm": 2.4542048, "learning_rate": 7.199e-05, "elapsed_time_per_iteration": 6.49391294, "memory(GiB)": 21.48, "elapsed_time": "25m 5s", "remaining_time": "11h 11m 50s", "loss_scale": 1.0, "consumed_samples": 58624, "global_step/max_steps": "229/6362"} +{"lm loss": 5.69290972, "grad_norm": 1.75506485, "learning_rate": 7.23e-05, "elapsed_time_per_iteration": 6.72851682, "memory(GiB)": 21.48, "elapsed_time": "25m 11s", "remaining_time": "11h 11m 48s", "loss_scale": 1.0, "consumed_samples": 58880, "global_step/max_steps": "230/6362"} +{"lm loss": 5.73995829, "grad_norm": 2.26226401, "learning_rate": 7.262e-05, "elapsed_time_per_iteration": 6.71356177, "memory(GiB)": 21.48, "elapsed_time": "25m 18s", "remaining_time": "11h 11m 45s", "loss_scale": 1.0, "consumed_samples": 59136, "global_step/max_steps": "231/6362"} +{"lm loss": 5.72281742, "grad_norm": 2.06189299, "learning_rate": 7.293e-05, "elapsed_time_per_iteration": 6.34603715, "memory(GiB)": 21.48, "elapsed_time": "25m 24s", "remaining_time": "11h 11m 32s", "loss_scale": 1.0, "consumed_samples": 59392, "global_step/max_steps": "232/6362"} +{"lm loss": 5.72397327, "grad_norm": 2.29671168, "learning_rate": 7.325e-05, "elapsed_time_per_iteration": 6.21525121, "memory(GiB)": 21.48, "elapsed_time": "25m 31s", "remaining_time": "11h 11m 16s", "loss_scale": 1.0, "consumed_samples": 59648, "global_step/max_steps": "233/6362"} +{"lm loss": 5.72773314, "grad_norm": 2.37950277, "learning_rate": 7.356e-05, "elapsed_time_per_iteration": 6.34388971, "memory(GiB)": 21.48, "elapsed_time": "25m 37s", "remaining_time": "11h 11m 4s", "loss_scale": 1.0, "consumed_samples": 59904, "global_step/max_steps": "234/6362"} +{"lm loss": 5.72669125, "grad_norm": 2.21164274, "learning_rate": 7.388e-05, "elapsed_time_per_iteration": 6.36039615, "memory(GiB)": 21.48, "elapsed_time": "25m 43s", "remaining_time": "11h 10m 52s", "loss_scale": 1.0, "consumed_samples": 60160, "global_step/max_steps": "235/6362"} +{"lm loss": 5.72137451, "grad_norm": 2.19752765, "learning_rate": 7.419e-05, "elapsed_time_per_iteration": 6.3327601, "memory(GiB)": 21.48, "elapsed_time": "25m 50s", "remaining_time": "11h 10m 39s", "loss_scale": 1.0, "consumed_samples": 60416, "global_step/max_steps": "236/6362"} +{"lm loss": 5.69213343, "grad_norm": 2.00574398, "learning_rate": 7.45e-05, "elapsed_time_per_iteration": 6.36738372, "memory(GiB)": 21.48, "elapsed_time": "25m 56s", "remaining_time": "11h 10m 27s", "loss_scale": 1.0, "consumed_samples": 60672, "global_step/max_steps": "237/6362"} +{"lm loss": 5.70022154, "grad_norm": 1.9630456, "learning_rate": 7.482e-05, "elapsed_time_per_iteration": 6.4550333, "memory(GiB)": 21.48, "elapsed_time": "26m 3s", "remaining_time": "11h 10m 18s", "loss_scale": 1.0, "consumed_samples": 60928, "global_step/max_steps": "238/6362"} +{"lm loss": 5.72251225, "grad_norm": 2.45355725, "learning_rate": 7.513e-05, "elapsed_time_per_iteration": 6.38009286, "memory(GiB)": 21.48, "elapsed_time": "26m 9s", "remaining_time": "11h 10m 6s", "loss_scale": 1.0, "consumed_samples": 61184, "global_step/max_steps": "239/6362"} +{"lm loss": 5.70994806, "grad_norm": 2.39566183, "learning_rate": 7.545e-05, "elapsed_time_per_iteration": 6.50541997, "memory(GiB)": 21.48, "elapsed_time": "26m 15s", "remaining_time": "11h 9m 58s", "loss_scale": 1.0, "consumed_samples": 61440, "global_step/max_steps": "240/6362"} +{"lm loss": 5.70623875, "grad_norm": 2.27897406, "learning_rate": 7.576e-05, "elapsed_time_per_iteration": 6.42196679, "memory(GiB)": 21.48, "elapsed_time": "26m 22s", "remaining_time": "11h 9m 48s", "loss_scale": 1.0, "consumed_samples": 61696, "global_step/max_steps": "241/6362"} +{"lm loss": 5.70648241, "grad_norm": 2.04036736, "learning_rate": 7.608e-05, "elapsed_time_per_iteration": 6.49665761, "memory(GiB)": 21.48, "elapsed_time": "26m 28s", "remaining_time": "11h 9m 40s", "loss_scale": 1.0, "consumed_samples": 61952, "global_step/max_steps": "242/6362"} +{"lm loss": 5.71302652, "grad_norm": 2.22713304, "learning_rate": 7.639e-05, "elapsed_time_per_iteration": 6.36525989, "memory(GiB)": 21.48, "elapsed_time": "26m 35s", "remaining_time": "11h 9m 28s", "loss_scale": 1.0, "consumed_samples": 62208, "global_step/max_steps": "243/6362"} +{"lm loss": 5.69304657, "grad_norm": 2.02292132, "learning_rate": 7.671e-05, "elapsed_time_per_iteration": 6.35323668, "memory(GiB)": 21.48, "elapsed_time": "26m 41s", "remaining_time": "11h 9m 16s", "loss_scale": 1.0, "consumed_samples": 62464, "global_step/max_steps": "244/6362"} +{"lm loss": 5.69756126, "grad_norm": 1.87969398, "learning_rate": 7.702e-05, "elapsed_time_per_iteration": 6.37607861, "memory(GiB)": 21.48, "elapsed_time": "26m 47s", "remaining_time": "11h 9m 5s", "loss_scale": 1.0, "consumed_samples": 62720, "global_step/max_steps": "245/6362"} +{"lm loss": 5.69200563, "grad_norm": 2.19272327, "learning_rate": 7.733e-05, "elapsed_time_per_iteration": 6.3564508, "memory(GiB)": 21.48, "elapsed_time": "26m 54s", "remaining_time": "11h 8m 53s", "loss_scale": 1.0, "consumed_samples": 62976, "global_step/max_steps": "246/6362"} +{"lm loss": 5.7061491, "grad_norm": 2.64877343, "learning_rate": 7.765e-05, "elapsed_time_per_iteration": 6.22782564, "memory(GiB)": 21.48, "elapsed_time": "27m 0s", "remaining_time": "11h 8m 39s", "loss_scale": 1.0, "consumed_samples": 63232, "global_step/max_steps": "247/6362"} +{"lm loss": 5.67272377, "grad_norm": 1.56915557, "learning_rate": 7.796e-05, "elapsed_time_per_iteration": 6.42218256, "memory(GiB)": 21.48, "elapsed_time": "27m 6s", "remaining_time": "11h 8m 29s", "loss_scale": 1.0, "consumed_samples": 63488, "global_step/max_steps": "248/6362"} +{"lm loss": 5.69324875, "grad_norm": 2.27398515, "learning_rate": 7.828e-05, "elapsed_time_per_iteration": 6.55026937, "memory(GiB)": 21.48, "elapsed_time": "27m 13s", "remaining_time": "11h 8m 22s", "loss_scale": 1.0, "consumed_samples": 63744, "global_step/max_steps": "249/6362"} +{"lm loss": 5.69044447, "grad_norm": 1.7427057, "learning_rate": 7.859e-05, "elapsed_time_per_iteration": 6.56044793, "memory(GiB)": 21.48, "elapsed_time": "27m 20s", "remaining_time": "11h 8m 15s", "loss_scale": 1.0, "consumed_samples": 64000, "global_step/max_steps": "250/6362"} +{"lm loss": 5.70687628, "grad_norm": 2.57650185, "learning_rate": 7.891e-05, "elapsed_time_per_iteration": 6.4033432, "memory(GiB)": 21.48, "elapsed_time": "27m 26s", "remaining_time": "11h 8m 5s", "loss_scale": 1.0, "consumed_samples": 64256, "global_step/max_steps": "251/6362"} +{"lm loss": 5.68092728, "grad_norm": 2.08144379, "learning_rate": 7.922e-05, "elapsed_time_per_iteration": 6.32720685, "memory(GiB)": 21.48, "elapsed_time": "27m 32s", "remaining_time": "11h 7m 53s", "loss_scale": 1.0, "consumed_samples": 64512, "global_step/max_steps": "252/6362"} +{"lm loss": 5.67928743, "grad_norm": 2.07689667, "learning_rate": 7.953e-05, "elapsed_time_per_iteration": 6.65624118, "memory(GiB)": 21.48, "elapsed_time": "27m 39s", "remaining_time": "11h 7m 48s", "loss_scale": 1.0, "consumed_samples": 64768, "global_step/max_steps": "253/6362"} +{"lm loss": 5.67651176, "grad_norm": 2.32555652, "learning_rate": 7.985e-05, "elapsed_time_per_iteration": 6.47616673, "memory(GiB)": 21.48, "elapsed_time": "27m 45s", "remaining_time": "11h 7m 40s", "loss_scale": 1.0, "consumed_samples": 65024, "global_step/max_steps": "254/6362"} +{"lm loss": 5.68684673, "grad_norm": 1.99531221, "learning_rate": 8.016e-05, "elapsed_time_per_iteration": 6.6890285, "memory(GiB)": 21.48, "elapsed_time": "27m 52s", "remaining_time": "11h 7m 36s", "loss_scale": 1.0, "consumed_samples": 65280, "global_step/max_steps": "255/6362"} +{"lm loss": 5.7015543, "grad_norm": 2.92265344, "learning_rate": 8.048e-05, "elapsed_time_per_iteration": 6.32649469, "memory(GiB)": 21.48, "elapsed_time": "27m 58s", "remaining_time": "11h 7m 24s", "loss_scale": 1.0, "consumed_samples": 65536, "global_step/max_steps": "256/6362"} +{"lm loss": 5.67107248, "grad_norm": 1.8691833, "learning_rate": 8.079e-05, "elapsed_time_per_iteration": 6.64051795, "memory(GiB)": 21.48, "elapsed_time": "28m 5s", "remaining_time": "11h 7m 20s", "loss_scale": 1.0, "consumed_samples": 65792, "global_step/max_steps": "257/6362"} +{"lm loss": 5.68496227, "grad_norm": 2.10428858, "learning_rate": 8.111e-05, "elapsed_time_per_iteration": 6.72422838, "memory(GiB)": 21.48, "elapsed_time": "28m 12s", "remaining_time": "11h 7m 17s", "loss_scale": 1.0, "consumed_samples": 66048, "global_step/max_steps": "258/6362"} +{"lm loss": 5.69436264, "grad_norm": 2.09075546, "learning_rate": 8.142e-05, "elapsed_time_per_iteration": 6.66414857, "memory(GiB)": 21.48, "elapsed_time": "28m 18s", "remaining_time": "11h 7m 13s", "loss_scale": 1.0, "consumed_samples": 66304, "global_step/max_steps": "259/6362"} +{"lm loss": 5.67560911, "grad_norm": 2.43881512, "learning_rate": 8.174e-05, "elapsed_time_per_iteration": 6.50854111, "memory(GiB)": 21.48, "elapsed_time": "28m 25s", "remaining_time": "11h 7m 5s", "loss_scale": 1.0, "consumed_samples": 66560, "global_step/max_steps": "260/6362"} +{"lm loss": 5.67585421, "grad_norm": 2.36686945, "learning_rate": 8.205e-05, "elapsed_time_per_iteration": 6.40499425, "memory(GiB)": 21.48, "elapsed_time": "28m 31s", "remaining_time": "11h 6m 55s", "loss_scale": 1.0, "consumed_samples": 66816, "global_step/max_steps": "261/6362"} +{"lm loss": 5.66422176, "grad_norm": 2.21270204, "learning_rate": 8.236e-05, "elapsed_time_per_iteration": 6.54433274, "memory(GiB)": 21.49, "elapsed_time": "28m 38s", "remaining_time": "11h 6m 48s", "loss_scale": 1.0, "consumed_samples": 67072, "global_step/max_steps": "262/6362"} +{"lm loss": 5.67031097, "grad_norm": 2.02726817, "learning_rate": 8.268e-05, "elapsed_time_per_iteration": 6.73026919, "memory(GiB)": 21.49, "elapsed_time": "28m 45s", "remaining_time": "11h 6m 46s", "loss_scale": 1.0, "consumed_samples": 67328, "global_step/max_steps": "263/6362"} +{"lm loss": 5.67463636, "grad_norm": 2.15042305, "learning_rate": 8.299e-05, "elapsed_time_per_iteration": 6.49050879, "memory(GiB)": 21.49, "elapsed_time": "28m 51s", "remaining_time": "11h 6m 37s", "loss_scale": 1.0, "consumed_samples": 67584, "global_step/max_steps": "264/6362"} +{"lm loss": 5.66858149, "grad_norm": 2.7707572, "learning_rate": 8.331e-05, "elapsed_time_per_iteration": 6.52783179, "memory(GiB)": 21.49, "elapsed_time": "28m 58s", "remaining_time": "11h 6m 30s", "loss_scale": 1.0, "consumed_samples": 67840, "global_step/max_steps": "265/6362"} +{"lm loss": 5.67893553, "grad_norm": 1.83633018, "learning_rate": 8.362e-05, "elapsed_time_per_iteration": 6.80179453, "memory(GiB)": 21.49, "elapsed_time": "29m 4s", "remaining_time": "11h 6m 29s", "loss_scale": 1.0, "consumed_samples": 68096, "global_step/max_steps": "266/6362"} +{"lm loss": 5.66833878, "grad_norm": 2.37668967, "learning_rate": 8.394e-05, "elapsed_time_per_iteration": 6.64874744, "memory(GiB)": 21.49, "elapsed_time": "29m 11s", "remaining_time": "11h 6m 25s", "loss_scale": 1.0, "consumed_samples": 68352, "global_step/max_steps": "267/6362"} +{"lm loss": 5.66233683, "grad_norm": 1.59426522, "learning_rate": 8.425e-05, "elapsed_time_per_iteration": 6.27366185, "memory(GiB)": 21.49, "elapsed_time": "29m 17s", "remaining_time": "11h 6m 12s", "loss_scale": 1.0, "consumed_samples": 68608, "global_step/max_steps": "268/6362"} +{"lm loss": 5.67535496, "grad_norm": 2.91507459, "learning_rate": 8.456e-05, "elapsed_time_per_iteration": 6.42149234, "memory(GiB)": 21.49, "elapsed_time": "29m 24s", "remaining_time": "11h 6m 2s", "loss_scale": 1.0, "consumed_samples": 68864, "global_step/max_steps": "269/6362"} +{"lm loss": 5.67708349, "grad_norm": 1.76324129, "learning_rate": 8.488e-05, "elapsed_time_per_iteration": 6.58592391, "memory(GiB)": 21.49, "elapsed_time": "29m 30s", "remaining_time": "11h 5m 56s", "loss_scale": 1.0, "consumed_samples": 69120, "global_step/max_steps": "270/6362"} +{"lm loss": 5.64825439, "grad_norm": 2.06461406, "learning_rate": 8.519e-05, "elapsed_time_per_iteration": 6.64675975, "memory(GiB)": 21.49, "elapsed_time": "29m 37s", "remaining_time": "11h 5m 51s", "loss_scale": 1.0, "consumed_samples": 69376, "global_step/max_steps": "271/6362"} +{"lm loss": 5.64510965, "grad_norm": 1.77856708, "learning_rate": 8.551e-05, "elapsed_time_per_iteration": 6.8003633, "memory(GiB)": 21.49, "elapsed_time": "29m 44s", "remaining_time": "11h 5m 50s", "loss_scale": 1.0, "consumed_samples": 69632, "global_step/max_steps": "272/6362"} +{"lm loss": 5.65858936, "grad_norm": 1.7386384, "learning_rate": 8.582e-05, "elapsed_time_per_iteration": 6.57075429, "memory(GiB)": 21.49, "elapsed_time": "29m 50s", "remaining_time": "11h 5m 44s", "loss_scale": 1.0, "consumed_samples": 69888, "global_step/max_steps": "273/6362"} +{"lm loss": 5.66350222, "grad_norm": 2.16993546, "learning_rate": 8.614e-05, "elapsed_time_per_iteration": 6.77169633, "memory(GiB)": 21.49, "elapsed_time": "29m 57s", "remaining_time": "11h 5m 42s", "loss_scale": 1.0, "consumed_samples": 70144, "global_step/max_steps": "274/6362"} +{"lm loss": 5.65277243, "grad_norm": 1.97792923, "learning_rate": 8.645e-05, "elapsed_time_per_iteration": 6.54320192, "memory(GiB)": 21.49, "elapsed_time": "30m 4s", "remaining_time": "11h 5m 35s", "loss_scale": 1.0, "consumed_samples": 70400, "global_step/max_steps": "275/6362"} +{"lm loss": 5.67620516, "grad_norm": 2.31283569, "learning_rate": 8.677e-05, "elapsed_time_per_iteration": 6.59502006, "memory(GiB)": 21.49, "elapsed_time": "30m 10s", "remaining_time": "11h 5m 29s", "loss_scale": 1.0, "consumed_samples": 70656, "global_step/max_steps": "276/6362"} +{"lm loss": 5.63197374, "grad_norm": 2.34173322, "learning_rate": 8.708e-05, "elapsed_time_per_iteration": 6.98240519, "memory(GiB)": 21.49, "elapsed_time": "30m 17s", "remaining_time": "11h 5m 32s", "loss_scale": 1.0, "consumed_samples": 70912, "global_step/max_steps": "277/6362"} +{"lm loss": 5.64341736, "grad_norm": 2.17520881, "learning_rate": 8.739e-05, "elapsed_time_per_iteration": 6.43566203, "memory(GiB)": 21.49, "elapsed_time": "30m 24s", "remaining_time": "11h 5m 23s", "loss_scale": 1.0, "consumed_samples": 71168, "global_step/max_steps": "278/6362"} +{"lm loss": 5.65076637, "grad_norm": 2.52419949, "learning_rate": 8.771e-05, "elapsed_time_per_iteration": 6.53869462, "memory(GiB)": 21.49, "elapsed_time": "30m 30s", "remaining_time": "11h 5m 16s", "loss_scale": 1.0, "consumed_samples": 71424, "global_step/max_steps": "279/6362"} +{"lm loss": 5.64180803, "grad_norm": 1.88133633, "learning_rate": 8.802e-05, "elapsed_time_per_iteration": 6.38195229, "memory(GiB)": 21.49, "elapsed_time": "30m 37s", "remaining_time": "11h 5m 5s", "loss_scale": 1.0, "consumed_samples": 71680, "global_step/max_steps": "280/6362"} +{"lm loss": 5.64872742, "grad_norm": 2.7146008, "learning_rate": 8.834e-05, "elapsed_time_per_iteration": 6.52956152, "memory(GiB)": 21.49, "elapsed_time": "30m 43s", "remaining_time": "11h 4m 58s", "loss_scale": 1.0, "consumed_samples": 71936, "global_step/max_steps": "281/6362"} +{"lm loss": 5.63492632, "grad_norm": 1.39466858, "learning_rate": 8.865e-05, "elapsed_time_per_iteration": 6.52590632, "memory(GiB)": 21.49, "elapsed_time": "30m 50s", "remaining_time": "11h 4m 51s", "loss_scale": 1.0, "consumed_samples": 72192, "global_step/max_steps": "282/6362"} +{"lm loss": 5.64646578, "grad_norm": 2.29139781, "learning_rate": 8.897e-05, "elapsed_time_per_iteration": 6.45587611, "memory(GiB)": 21.49, "elapsed_time": "30m 56s", "remaining_time": "11h 4m 42s", "loss_scale": 1.0, "consumed_samples": 72448, "global_step/max_steps": "283/6362"} +{"lm loss": 5.64230728, "grad_norm": 1.90249908, "learning_rate": 8.928e-05, "elapsed_time_per_iteration": 6.36597276, "memory(GiB)": 21.49, "elapsed_time": "31m 3s", "remaining_time": "11h 4m 31s", "loss_scale": 1.0, "consumed_samples": 72704, "global_step/max_steps": "284/6362"} +{"lm loss": 5.64664268, "grad_norm": 3.13985276, "learning_rate": 8.959e-05, "elapsed_time_per_iteration": 6.50146699, "memory(GiB)": 21.49, "elapsed_time": "31m 9s", "remaining_time": "11h 4m 23s", "loss_scale": 1.0, "consumed_samples": 72960, "global_step/max_steps": "285/6362"} +{"lm loss": 5.64434099, "grad_norm": 1.72210801, "learning_rate": 8.991e-05, "elapsed_time_per_iteration": 6.46312499, "memory(GiB)": 21.49, "elapsed_time": "31m 15s", "remaining_time": "11h 4m 15s", "loss_scale": 1.0, "consumed_samples": 73216, "global_step/max_steps": "286/6362"} +{"lm loss": 5.64215708, "grad_norm": 2.00726533, "learning_rate": 9.022e-05, "elapsed_time_per_iteration": 6.52057052, "memory(GiB)": 21.49, "elapsed_time": "31m 22s", "remaining_time": "11h 4m 7s", "loss_scale": 1.0, "consumed_samples": 73472, "global_step/max_steps": "287/6362"} +{"lm loss": 5.64763069, "grad_norm": 2.60412931, "learning_rate": 9.054e-05, "elapsed_time_per_iteration": 6.35887241, "memory(GiB)": 21.49, "elapsed_time": "31m 28s", "remaining_time": "11h 3m 56s", "loss_scale": 1.0, "consumed_samples": 73728, "global_step/max_steps": "288/6362"} +{"lm loss": 5.63426208, "grad_norm": 1.82000709, "learning_rate": 9.085e-05, "elapsed_time_per_iteration": 6.55890775, "memory(GiB)": 21.49, "elapsed_time": "31m 35s", "remaining_time": "11h 3m 50s", "loss_scale": 1.0, "consumed_samples": 73984, "global_step/max_steps": "289/6362"} +{"lm loss": 5.64308023, "grad_norm": 2.55639458, "learning_rate": 9.117e-05, "elapsed_time_per_iteration": 6.45620275, "memory(GiB)": 21.49, "elapsed_time": "31m 41s", "remaining_time": "11h 3m 41s", "loss_scale": 1.0, "consumed_samples": 74240, "global_step/max_steps": "290/6362"} +{"lm loss": 5.62394285, "grad_norm": 1.93808365, "learning_rate": 9.148e-05, "elapsed_time_per_iteration": 6.61750412, "memory(GiB)": 21.49, "elapsed_time": "31m 48s", "remaining_time": "11h 3m 36s", "loss_scale": 1.0, "consumed_samples": 74496, "global_step/max_steps": "291/6362"} +{"lm loss": 5.64326668, "grad_norm": 2.22070932, "learning_rate": 9.18e-05, "elapsed_time_per_iteration": 6.46842504, "memory(GiB)": 21.49, "elapsed_time": "31m 54s", "remaining_time": "11h 3m 27s", "loss_scale": 1.0, "consumed_samples": 74752, "global_step/max_steps": "292/6362"} +{"lm loss": 5.64099503, "grad_norm": 1.82638919, "learning_rate": 9.211e-05, "elapsed_time_per_iteration": 6.5498209, "memory(GiB)": 21.49, "elapsed_time": "32m 1s", "remaining_time": "11h 3m 21s", "loss_scale": 1.0, "consumed_samples": 75008, "global_step/max_steps": "293/6362"} +{"lm loss": 5.64428854, "grad_norm": 2.57634163, "learning_rate": 9.242e-05, "elapsed_time_per_iteration": 6.62773585, "memory(GiB)": 21.49, "elapsed_time": "32m 8s", "remaining_time": "11h 3m 16s", "loss_scale": 1.0, "consumed_samples": 75264, "global_step/max_steps": "294/6362"} +{"lm loss": 5.63316298, "grad_norm": 1.79476297, "learning_rate": 9.274e-05, "elapsed_time_per_iteration": 6.54458761, "memory(GiB)": 21.49, "elapsed_time": "32m 14s", "remaining_time": "11h 3m 9s", "loss_scale": 1.0, "consumed_samples": 75520, "global_step/max_steps": "295/6362"} +{"lm loss": 5.65223408, "grad_norm": 2.33698153, "learning_rate": 9.305e-05, "elapsed_time_per_iteration": 6.54239988, "memory(GiB)": 21.49, "elapsed_time": "32m 21s", "remaining_time": "11h 3m 2s", "loss_scale": 1.0, "consumed_samples": 75776, "global_step/max_steps": "296/6362"} +{"lm loss": 5.63049364, "grad_norm": 1.73437703, "learning_rate": 9.337e-05, "elapsed_time_per_iteration": 6.35817313, "memory(GiB)": 21.49, "elapsed_time": "32m 27s", "remaining_time": "11h 2m 51s", "loss_scale": 1.0, "consumed_samples": 76032, "global_step/max_steps": "297/6362"} +{"lm loss": 5.62389851, "grad_norm": 1.86171794, "learning_rate": 9.368e-05, "elapsed_time_per_iteration": 6.38882494, "memory(GiB)": 21.49, "elapsed_time": "32m 33s", "remaining_time": "11h 2m 41s", "loss_scale": 1.0, "consumed_samples": 76288, "global_step/max_steps": "298/6362"} +{"lm loss": 5.61387777, "grad_norm": 1.89595211, "learning_rate": 9.4e-05, "elapsed_time_per_iteration": 6.67956138, "memory(GiB)": 21.49, "elapsed_time": "32m 40s", "remaining_time": "11h 2m 37s", "loss_scale": 1.0, "consumed_samples": 76544, "global_step/max_steps": "299/6362"} +{"lm loss": 5.62377214, "grad_norm": 2.02750778, "learning_rate": 9.431e-05, "elapsed_time_per_iteration": 6.46366811, "memory(GiB)": 21.49, "elapsed_time": "32m 47s", "remaining_time": "11h 2m 29s", "loss_scale": 1.0, "consumed_samples": 76800, "global_step/max_steps": "300/6362"} +{"lm loss": 5.62425518, "grad_norm": 2.57659245, "learning_rate": 9.462e-05, "elapsed_time_per_iteration": 6.68622899, "memory(GiB)": 21.49, "elapsed_time": "32m 53s", "remaining_time": "11h 2m 25s", "loss_scale": 1.0, "consumed_samples": 77056, "global_step/max_steps": "301/6362"} +{"lm loss": 5.63258171, "grad_norm": 1.91590309, "learning_rate": 9.494e-05, "elapsed_time_per_iteration": 6.64965606, "memory(GiB)": 21.49, "elapsed_time": "33m 0s", "remaining_time": "11h 2m 20s", "loss_scale": 1.0, "consumed_samples": 77312, "global_step/max_steps": "302/6362"} +{"lm loss": 5.62947512, "grad_norm": 2.27740049, "learning_rate": 9.525e-05, "elapsed_time_per_iteration": 6.56754971, "memory(GiB)": 21.49, "elapsed_time": "33m 7s", "remaining_time": "11h 2m 14s", "loss_scale": 1.0, "consumed_samples": 77568, "global_step/max_steps": "303/6362"} +{"lm loss": 5.61915874, "grad_norm": 1.69068134, "learning_rate": 9.557e-05, "elapsed_time_per_iteration": 6.43192649, "memory(GiB)": 21.49, "elapsed_time": "33m 13s", "remaining_time": "11h 2m 5s", "loss_scale": 1.0, "consumed_samples": 77824, "global_step/max_steps": "304/6362"} +{"lm loss": 5.5965414, "grad_norm": 2.53787327, "learning_rate": 9.588e-05, "elapsed_time_per_iteration": 6.46261597, "memory(GiB)": 21.49, "elapsed_time": "33m 19s", "remaining_time": "11h 1m 56s", "loss_scale": 1.0, "consumed_samples": 78080, "global_step/max_steps": "305/6362"} +{"lm loss": 5.60729313, "grad_norm": 1.44701719, "learning_rate": 9.62e-05, "elapsed_time_per_iteration": 6.65315533, "memory(GiB)": 21.49, "elapsed_time": "33m 26s", "remaining_time": "11h 1m 51s", "loss_scale": 1.0, "consumed_samples": 78336, "global_step/max_steps": "306/6362"} +{"lm loss": 5.62737799, "grad_norm": 2.84599257, "learning_rate": 9.651e-05, "elapsed_time_per_iteration": 6.58834171, "memory(GiB)": 21.49, "elapsed_time": "33m 33s", "remaining_time": "11h 1m 45s", "loss_scale": 1.0, "consumed_samples": 78592, "global_step/max_steps": "307/6362"} +{"lm loss": 5.59471655, "grad_norm": 1.93479121, "learning_rate": 9.682e-05, "elapsed_time_per_iteration": 6.59542727, "memory(GiB)": 21.49, "elapsed_time": "33m 39s", "remaining_time": "11h 1m 40s", "loss_scale": 1.0, "consumed_samples": 78848, "global_step/max_steps": "308/6362"} +{"lm loss": 5.61510992, "grad_norm": 2.04138899, "learning_rate": 9.714e-05, "elapsed_time_per_iteration": 6.33526826, "memory(GiB)": 21.49, "elapsed_time": "33m 46s", "remaining_time": "11h 1m 29s", "loss_scale": 1.0, "consumed_samples": 79104, "global_step/max_steps": "309/6362"} +{"lm loss": 5.61813545, "grad_norm": 2.04764605, "learning_rate": 9.745e-05, "elapsed_time_per_iteration": 6.67879224, "memory(GiB)": 21.49, "elapsed_time": "33m 52s", "remaining_time": "11h 1m 25s", "loss_scale": 1.0, "consumed_samples": 79360, "global_step/max_steps": "310/6362"} +{"lm loss": 5.63945484, "grad_norm": 1.91938245, "learning_rate": 9.777e-05, "elapsed_time_per_iteration": 6.76470089, "memory(GiB)": 21.49, "elapsed_time": "33m 59s", "remaining_time": "11h 1m 22s", "loss_scale": 1.0, "consumed_samples": 79616, "global_step/max_steps": "311/6362"} +{"lm loss": 5.60534811, "grad_norm": 2.16953468, "learning_rate": 9.808e-05, "elapsed_time_per_iteration": 6.5839293, "memory(GiB)": 21.49, "elapsed_time": "34m 6s", "remaining_time": "11h 1m 16s", "loss_scale": 1.0, "consumed_samples": 79872, "global_step/max_steps": "312/6362"} +{"lm loss": 5.59651327, "grad_norm": 2.03791189, "learning_rate": 9.84e-05, "elapsed_time_per_iteration": 6.54186821, "memory(GiB)": 21.49, "elapsed_time": "34m 12s", "remaining_time": "11h 1m 9s", "loss_scale": 1.0, "consumed_samples": 80128, "global_step/max_steps": "313/6362"} +{"lm loss": 5.6055131, "grad_norm": 2.22165847, "learning_rate": 9.871e-05, "elapsed_time_per_iteration": 6.51712561, "memory(GiB)": 21.49, "elapsed_time": "34m 19s", "remaining_time": "11h 1m 2s", "loss_scale": 1.0, "consumed_samples": 80384, "global_step/max_steps": "314/6362"} +{"lm loss": 5.59577131, "grad_norm": 1.84915614, "learning_rate": 9.903e-05, "elapsed_time_per_iteration": 6.73665118, "memory(GiB)": 21.49, "elapsed_time": "34m 25s", "remaining_time": "11h 0m 59s", "loss_scale": 1.0, "consumed_samples": 80640, "global_step/max_steps": "315/6362"} +{"lm loss": 5.59557676, "grad_norm": 2.20289636, "learning_rate": 9.934e-05, "elapsed_time_per_iteration": 6.50187564, "memory(GiB)": 21.49, "elapsed_time": "34m 32s", "remaining_time": "11h 0m 51s", "loss_scale": 1.0, "consumed_samples": 80896, "global_step/max_steps": "316/6362"} +{"lm loss": 5.61299181, "grad_norm": 2.37082982, "learning_rate": 9.965e-05, "elapsed_time_per_iteration": 6.75880527, "memory(GiB)": 21.49, "elapsed_time": "34m 39s", "remaining_time": "11h 0m 48s", "loss_scale": 1.0, "consumed_samples": 81152, "global_step/max_steps": "317/6362"} +{"lm loss": 5.5949111, "grad_norm": 1.81862962, "learning_rate": 9.997e-05, "elapsed_time_per_iteration": 6.6246655, "memory(GiB)": 21.49, "elapsed_time": "34m 45s", "remaining_time": "11h 0m 43s", "loss_scale": 1.0, "consumed_samples": 81408, "global_step/max_steps": "318/6362"} +{"lm loss": 5.60278177, "grad_norm": 2.20502043, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.66566563, "memory(GiB)": 21.49, "elapsed_time": "34m 52s", "remaining_time": "11h 0m 38s", "loss_scale": 1.0, "consumed_samples": 81664, "global_step/max_steps": "319/6362"} +{"lm loss": 5.59517193, "grad_norm": 1.96559882, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.75581503, "memory(GiB)": 21.49, "elapsed_time": "34m 59s", "remaining_time": "11h 0m 36s", "loss_scale": 1.0, "consumed_samples": 81920, "global_step/max_steps": "320/6362"} +{"lm loss": 5.60710144, "grad_norm": 2.20090961, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.81259656, "memory(GiB)": 21.49, "elapsed_time": "35m 6s", "remaining_time": "11h 0m 34s", "loss_scale": 1.0, "consumed_samples": 82176, "global_step/max_steps": "321/6362"} +{"lm loss": 5.59276485, "grad_norm": 1.89106226, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.54077435, "memory(GiB)": 21.49, "elapsed_time": "35m 12s", "remaining_time": "11h 0m 27s", "loss_scale": 1.0, "consumed_samples": 82432, "global_step/max_steps": "322/6362"} +{"lm loss": 5.58633375, "grad_norm": 1.81376672, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.60047531, "memory(GiB)": 21.49, "elapsed_time": "35m 19s", "remaining_time": "11h 0m 21s", "loss_scale": 1.0, "consumed_samples": 82688, "global_step/max_steps": "323/6362"} +{"lm loss": 5.58913136, "grad_norm": 2.11256886, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.61790586, "memory(GiB)": 21.49, "elapsed_time": "35m 25s", "remaining_time": "11h 0m 16s", "loss_scale": 1.0, "consumed_samples": 82944, "global_step/max_steps": "324/6362"} +{"lm loss": 5.60539436, "grad_norm": 2.10411739, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.71409607, "memory(GiB)": 21.49, "elapsed_time": "35m 32s", "remaining_time": "11h 0m 12s", "loss_scale": 1.0, "consumed_samples": 83200, "global_step/max_steps": "325/6362"} +{"lm loss": 5.58445072, "grad_norm": 2.32563806, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.52521801, "memory(GiB)": 21.49, "elapsed_time": "35m 39s", "remaining_time": "11h 0m 5s", "loss_scale": 1.0, "consumed_samples": 83456, "global_step/max_steps": "326/6362"} +{"lm loss": 5.58300591, "grad_norm": 1.63432777, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.61188579, "memory(GiB)": 21.49, "elapsed_time": "35m 45s", "remaining_time": "10h 59m 59s", "loss_scale": 1.0, "consumed_samples": 83712, "global_step/max_steps": "327/6362"} +{"lm loss": 5.59568977, "grad_norm": 1.75646532, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.65137601, "memory(GiB)": 21.49, "elapsed_time": "35m 52s", "remaining_time": "10h 59m 54s", "loss_scale": 1.0, "consumed_samples": 83968, "global_step/max_steps": "328/6362"} +{"lm loss": 5.57656097, "grad_norm": 1.95569539, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.59008813, "memory(GiB)": 21.49, "elapsed_time": "35m 58s", "remaining_time": "10h 59m 48s", "loss_scale": 1.0, "consumed_samples": 84224, "global_step/max_steps": "329/6362"} +{"lm loss": 5.59103251, "grad_norm": 2.37073159, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.47179651, "memory(GiB)": 21.49, "elapsed_time": "36m 5s", "remaining_time": "10h 59m 40s", "loss_scale": 1.0, "consumed_samples": 84480, "global_step/max_steps": "330/6362"} +{"lm loss": 5.577878, "grad_norm": 1.77970409, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.61284733, "memory(GiB)": 21.49, "elapsed_time": "36m 11s", "remaining_time": "10h 59m 34s", "loss_scale": 1.0, "consumed_samples": 84736, "global_step/max_steps": "331/6362"} +{"lm loss": 5.5692296, "grad_norm": 2.03993344, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.88533878, "memory(GiB)": 21.49, "elapsed_time": "36m 18s", "remaining_time": "10h 59m 33s", "loss_scale": 1.0, "consumed_samples": 84992, "global_step/max_steps": "332/6362"} +{"lm loss": 5.57400751, "grad_norm": 1.68593073, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.44475961, "memory(GiB)": 21.49, "elapsed_time": "36m 25s", "remaining_time": "10h 59m 25s", "loss_scale": 1.0, "consumed_samples": 85248, "global_step/max_steps": "333/6362"} +{"lm loss": 5.58532143, "grad_norm": 2.35035014, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.81264114, "memory(GiB)": 21.49, "elapsed_time": "36m 32s", "remaining_time": "10h 59m 23s", "loss_scale": 1.0, "consumed_samples": 85504, "global_step/max_steps": "334/6362"} +{"lm loss": 5.58121109, "grad_norm": 1.95652092, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.67360663, "memory(GiB)": 21.49, "elapsed_time": "36m 38s", "remaining_time": "10h 59m 18s", "loss_scale": 1.0, "consumed_samples": 85760, "global_step/max_steps": "335/6362"} +{"lm loss": 5.57347775, "grad_norm": 2.10509491, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.57477736, "memory(GiB)": 21.49, "elapsed_time": "36m 45s", "remaining_time": "10h 59m 12s", "loss_scale": 1.0, "consumed_samples": 86016, "global_step/max_steps": "336/6362"} +{"lm loss": 5.58959866, "grad_norm": 1.67575049, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.5391655, "memory(GiB)": 21.49, "elapsed_time": "36m 51s", "remaining_time": "10h 59m 5s", "loss_scale": 1.0, "consumed_samples": 86272, "global_step/max_steps": "337/6362"} +{"lm loss": 5.5867362, "grad_norm": 1.88688838, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.73892045, "memory(GiB)": 21.49, "elapsed_time": "36m 58s", "remaining_time": "10h 59m 1s", "loss_scale": 1.0, "consumed_samples": 86528, "global_step/max_steps": "338/6362"} +{"lm loss": 5.57010365, "grad_norm": 1.86196256, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.63870049, "memory(GiB)": 21.49, "elapsed_time": "37m 5s", "remaining_time": "10h 58m 56s", "loss_scale": 1.0, "consumed_samples": 86784, "global_step/max_steps": "339/6362"} +{"lm loss": 5.5737195, "grad_norm": 1.82316554, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.44958568, "memory(GiB)": 21.49, "elapsed_time": "37m 11s", "remaining_time": "10h 58m 47s", "loss_scale": 1.0, "consumed_samples": 87040, "global_step/max_steps": "340/6362"} +{"lm loss": 5.56318712, "grad_norm": 2.13289356, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.54913163, "memory(GiB)": 21.49, "elapsed_time": "37m 18s", "remaining_time": "10h 58m 41s", "loss_scale": 1.0, "consumed_samples": 87296, "global_step/max_steps": "341/6362"} +{"lm loss": 5.58413839, "grad_norm": 1.77333248, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.3822031, "memory(GiB)": 21.49, "elapsed_time": "37m 24s", "remaining_time": "10h 58m 31s", "loss_scale": 1.0, "consumed_samples": 87552, "global_step/max_steps": "342/6362"} +{"lm loss": 5.58309984, "grad_norm": 2.44940996, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.70509028, "memory(GiB)": 21.49, "elapsed_time": "37m 31s", "remaining_time": "10h 58m 27s", "loss_scale": 1.0, "consumed_samples": 87808, "global_step/max_steps": "343/6362"} +{"lm loss": 5.57738113, "grad_norm": 1.74471223, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.47023344, "memory(GiB)": 21.49, "elapsed_time": "37m 37s", "remaining_time": "10h 58m 19s", "loss_scale": 1.0, "consumed_samples": 88064, "global_step/max_steps": "344/6362"} +{"lm loss": 5.55747175, "grad_norm": 1.77996695, "learning_rate": 0.0001, "elapsed_time_per_iteration": 6.45314455, "memory(GiB)": 21.49, "elapsed_time": "37m 44s", "remaining_time": "10h 58m 10s", "loss_scale": 1.0, "consumed_samples": 88320, "global_step/max_steps": "345/6362"} +{"lm loss": 5.56605673, "grad_norm": 1.49319649, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.69381666, "memory(GiB)": 21.49, "elapsed_time": "37m 50s", "remaining_time": "10h 58m 6s", "loss_scale": 1.0, "consumed_samples": 88576, "global_step/max_steps": "346/6362"} +{"lm loss": 5.56906176, "grad_norm": 2.06750822, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.57887459, "memory(GiB)": 21.49, "elapsed_time": "37m 57s", "remaining_time": "10h 58m 0s", "loss_scale": 1.0, "consumed_samples": 88832, "global_step/max_steps": "347/6362"} +{"lm loss": 5.56208277, "grad_norm": 1.71442759, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.53917265, "memory(GiB)": 21.49, "elapsed_time": "38m 4s", "remaining_time": "10h 57m 53s", "loss_scale": 1.0, "consumed_samples": 89088, "global_step/max_steps": "348/6362"} +{"lm loss": 5.56127787, "grad_norm": 2.21943641, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.40916538, "memory(GiB)": 21.49, "elapsed_time": "38m 10s", "remaining_time": "10h 57m 43s", "loss_scale": 1.0, "consumed_samples": 89344, "global_step/max_steps": "349/6362"} +{"lm loss": 5.56654882, "grad_norm": 1.9501003, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.47412705, "memory(GiB)": 21.49, "elapsed_time": "38m 16s", "remaining_time": "10h 57m 35s", "loss_scale": 1.0, "consumed_samples": 89600, "global_step/max_steps": "350/6362"} +{"lm loss": 5.53625345, "grad_norm": 1.77338731, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.42751169, "memory(GiB)": 21.49, "elapsed_time": "38m 23s", "remaining_time": "10h 57m 26s", "loss_scale": 1.0, "consumed_samples": 89856, "global_step/max_steps": "351/6362"} +{"lm loss": 5.56336164, "grad_norm": 1.78908122, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.59432483, "memory(GiB)": 21.49, "elapsed_time": "38m 30s", "remaining_time": "10h 57m 20s", "loss_scale": 1.0, "consumed_samples": 90112, "global_step/max_steps": "352/6362"} +{"lm loss": 5.53620148, "grad_norm": 1.94392228, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.46989155, "memory(GiB)": 21.49, "elapsed_time": "38m 36s", "remaining_time": "10h 57m 12s", "loss_scale": 1.0, "consumed_samples": 90368, "global_step/max_steps": "353/6362"} +{"lm loss": 5.57695723, "grad_norm": 1.9086622, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.53494406, "memory(GiB)": 21.49, "elapsed_time": "38m 43s", "remaining_time": "10h 57m 5s", "loss_scale": 1.0, "consumed_samples": 90624, "global_step/max_steps": "354/6362"} +{"lm loss": 5.5541687, "grad_norm": 1.85539961, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.40077543, "memory(GiB)": 21.49, "elapsed_time": "38m 49s", "remaining_time": "10h 56m 56s", "loss_scale": 1.0, "consumed_samples": 90880, "global_step/max_steps": "355/6362"} +{"lm loss": 5.53280163, "grad_norm": 1.81205153, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.44723821, "memory(GiB)": 21.49, "elapsed_time": "38m 55s", "remaining_time": "10h 56m 47s", "loss_scale": 1.0, "consumed_samples": 91136, "global_step/max_steps": "356/6362"} +{"lm loss": 5.54365253, "grad_norm": 2.08896637, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.47499871, "memory(GiB)": 21.49, "elapsed_time": "39m 2s", "remaining_time": "10h 56m 39s", "loss_scale": 1.0, "consumed_samples": 91392, "global_step/max_steps": "357/6362"} +{"lm loss": 5.53207111, "grad_norm": 1.74212313, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.46617579, "memory(GiB)": 21.49, "elapsed_time": "39m 8s", "remaining_time": "10h 56m 31s", "loss_scale": 1.0, "consumed_samples": 91648, "global_step/max_steps": "358/6362"} +{"lm loss": 5.53868389, "grad_norm": 1.86073065, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.65090179, "memory(GiB)": 21.49, "elapsed_time": "39m 15s", "remaining_time": "10h 56m 26s", "loss_scale": 1.0, "consumed_samples": 91904, "global_step/max_steps": "359/6362"} +{"lm loss": 5.54945278, "grad_norm": 2.17277288, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.72882724, "memory(GiB)": 21.49, "elapsed_time": "39m 22s", "remaining_time": "10h 56m 22s", "loss_scale": 1.0, "consumed_samples": 92160, "global_step/max_steps": "360/6362"} +{"lm loss": 5.53767776, "grad_norm": 1.70037186, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.48847055, "memory(GiB)": 21.49, "elapsed_time": "39m 28s", "remaining_time": "10h 56m 15s", "loss_scale": 1.0, "consumed_samples": 92416, "global_step/max_steps": "361/6362"} +{"lm loss": 5.54244852, "grad_norm": 1.73173237, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.38018227, "memory(GiB)": 21.49, "elapsed_time": "39m 35s", "remaining_time": "10h 56m 5s", "loss_scale": 1.0, "consumed_samples": 92672, "global_step/max_steps": "362/6362"} +{"lm loss": 5.54768085, "grad_norm": 2.19841623, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.53529835, "memory(GiB)": 21.49, "elapsed_time": "39m 41s", "remaining_time": "10h 55m 58s", "loss_scale": 1.0, "consumed_samples": 92928, "global_step/max_steps": "363/6362"} +{"lm loss": 5.52223349, "grad_norm": 1.56652319, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.50701427, "memory(GiB)": 21.49, "elapsed_time": "39m 48s", "remaining_time": "10h 55m 51s", "loss_scale": 1.0, "consumed_samples": 93184, "global_step/max_steps": "364/6362"} +{"lm loss": 5.5385747, "grad_norm": 2.05997348, "learning_rate": 9.999e-05, "elapsed_time_per_iteration": 6.50115514, "memory(GiB)": 21.49, "elapsed_time": "39m 54s", "remaining_time": "10h 55m 43s", "loss_scale": 1.0, "consumed_samples": 93440, "global_step/max_steps": "365/6362"} +{"lm loss": 5.55434513, "grad_norm": 2.0359416, "learning_rate": 9.998e-05, "elapsed_time_per_iteration": 6.16219473, "memory(GiB)": 21.49, "elapsed_time": "40m 0s", "remaining_time": "10h 55m 30s", "loss_scale": 1.0, "consumed_samples": 93696, "global_step/max_steps": "366/6362"} +{"lm loss": 5.5455184, "grad_norm": 2.095083, "learning_rate": 9.998e-05, "elapsed_time_per_iteration": 6.38130975, "memory(GiB)": 21.49, "elapsed_time": "40m 7s", "remaining_time": "10h 55m 21s", "loss_scale": 1.0, "consumed_samples": 93952, "global_step/max_steps": "367/6362"} +{"lm loss": 5.52364063, "grad_norm": 1.94725037, "learning_rate": 9.998e-05, "elapsed_time_per_iteration": 6.56000805, "memory(GiB)": 21.49, "elapsed_time": "40m 13s", "remaining_time": "10h 55m 14s", "loss_scale": 1.0, "consumed_samples": 94208, "global_step/max_steps": "368/6362"} +{"lm loss": 5.51534033, "grad_norm": 1.76168513, "learning_rate": 9.998e-05, "elapsed_time_per_iteration": 6.49422264, "memory(GiB)": 21.49, "elapsed_time": "40m 20s", "remaining_time": "10h 55m 6s", "loss_scale": 1.0, "consumed_samples": 94464, "global_step/max_steps": "369/6362"} +{"lm loss": 5.52734375, "grad_norm": 1.56191826, "learning_rate": 9.998e-05, "elapsed_time_per_iteration": 6.70956492, "memory(GiB)": 21.49, "elapsed_time": "40m 26s", "remaining_time": "10h 55m 2s", "loss_scale": 1.0, "consumed_samples": 94720, "global_step/max_steps": "370/6362"} +{"lm loss": 5.52099133, "grad_norm": 1.68551767, "learning_rate": 9.998e-05, "elapsed_time_per_iteration": 6.49856615, "memory(GiB)": 21.49, "elapsed_time": "40m 33s", "remaining_time": "10h 54m 55s", "loss_scale": 1.0, "consumed_samples": 94976, "global_step/max_steps": "371/6362"} +{"lm loss": 5.52264547, "grad_norm": 1.8928355, "learning_rate": 9.998e-05, "elapsed_time_per_iteration": 6.76748276, "memory(GiB)": 21.49, "elapsed_time": "40m 40s", "remaining_time": "10h 54m 52s", "loss_scale": 1.0, "consumed_samples": 95232, "global_step/max_steps": "372/6362"} +{"lm loss": 5.53436756, "grad_norm": 1.48364282, "learning_rate": 9.998e-05, "elapsed_time_per_iteration": 6.78353286, "memory(GiB)": 21.49, "elapsed_time": "40m 46s", "remaining_time": "10h 54m 49s", "loss_scale": 1.0, "consumed_samples": 95488, "global_step/max_steps": "373/6362"} +{"lm loss": 5.53921795, "grad_norm": 1.7320168, "learning_rate": 9.998e-05, "elapsed_time_per_iteration": 6.53343415, "memory(GiB)": 21.49, "elapsed_time": "40m 53s", "remaining_time": "10h 54m 42s", "loss_scale": 1.0, "consumed_samples": 95744, "global_step/max_steps": "374/6362"} +{"lm loss": 5.52084017, "grad_norm": 1.68104231, "learning_rate": 9.998e-05, "elapsed_time_per_iteration": 6.61358118, "memory(GiB)": 21.49, "elapsed_time": "41m 0s", "remaining_time": "10h 54m 36s", "loss_scale": 1.0, "consumed_samples": 96000, "global_step/max_steps": "375/6362"} +{"lm loss": 5.52399158, "grad_norm": 1.92210662, "learning_rate": 9.998e-05, "elapsed_time_per_iteration": 6.4727726, "memory(GiB)": 21.49, "elapsed_time": "41m 6s", "remaining_time": "10h 54m 28s", "loss_scale": 1.0, "consumed_samples": 96256, "global_step/max_steps": "376/6362"} +{"lm loss": 5.50246906, "grad_norm": 1.31944716, "learning_rate": 9.998e-05, "elapsed_time_per_iteration": 6.5663271, "memory(GiB)": 21.49, "elapsed_time": "41m 13s", "remaining_time": "10h 54m 21s", "loss_scale": 1.0, "consumed_samples": 96512, "global_step/max_steps": "377/6362"} +{"lm loss": 5.53528833, "grad_norm": 2.09270835, "learning_rate": 9.998e-05, "elapsed_time_per_iteration": 6.74827814, "memory(GiB)": 21.49, "elapsed_time": "41m 19s", "remaining_time": "10h 54m 18s", "loss_scale": 1.0, "consumed_samples": 96768, "global_step/max_steps": "378/6362"} +{"lm loss": 5.5212822, "grad_norm": 1.48967206, "learning_rate": 9.998e-05, "elapsed_time_per_iteration": 6.49727345, "memory(GiB)": 21.49, "elapsed_time": "41m 26s", "remaining_time": "10h 54m 10s", "loss_scale": 1.0, "consumed_samples": 97024, "global_step/max_steps": "379/6362"} +{"lm loss": 5.52853155, "grad_norm": 2.20281124, "learning_rate": 9.997e-05, "elapsed_time_per_iteration": 6.71258259, "memory(GiB)": 21.49, "elapsed_time": "41m 33s", "remaining_time": "10h 54m 6s", "loss_scale": 1.0, "consumed_samples": 97280, "global_step/max_steps": "380/6362"} +{"lm loss": 5.50824404, "grad_norm": 1.76232886, "learning_rate": 9.997e-05, "elapsed_time_per_iteration": 6.62216687, "memory(GiB)": 21.49, "elapsed_time": "41m 39s", "remaining_time": "10h 54m 1s", "loss_scale": 1.0, "consumed_samples": 97536, "global_step/max_steps": "381/6362"} +{"lm loss": 5.51461697, "grad_norm": 1.63985002, "learning_rate": 9.997e-05, "elapsed_time_per_iteration": 6.46537519, "memory(GiB)": 21.49, "elapsed_time": "41m 46s", "remaining_time": "10h 53m 52s", "loss_scale": 1.0, "consumed_samples": 97792, "global_step/max_steps": "382/6362"} +{"lm loss": 5.51547909, "grad_norm": 2.0075562, "learning_rate": 9.997e-05, "elapsed_time_per_iteration": 6.56599021, "memory(GiB)": 21.49, "elapsed_time": "41m 52s", "remaining_time": "10h 53m 46s", "loss_scale": 1.0, "consumed_samples": 98048, "global_step/max_steps": "383/6362"} +{"lm loss": 5.54006529, "grad_norm": 1.8806299, "learning_rate": 9.997e-05, "elapsed_time_per_iteration": 6.57411885, "memory(GiB)": 21.49, "elapsed_time": "41m 59s", "remaining_time": "10h 53m 40s", "loss_scale": 1.0, "consumed_samples": 98304, "global_step/max_steps": "384/6362"} +{"lm loss": 5.51282883, "grad_norm": 1.95363379, "learning_rate": 9.997e-05, "elapsed_time_per_iteration": 6.53540373, "memory(GiB)": 21.49, "elapsed_time": "42m 5s", "remaining_time": "10h 53m 33s", "loss_scale": 1.0, "consumed_samples": 98560, "global_step/max_steps": "385/6362"} +{"lm loss": 5.50610495, "grad_norm": 1.54951656, "learning_rate": 9.997e-05, "elapsed_time_per_iteration": 6.67492247, "memory(GiB)": 21.49, "elapsed_time": "42m 12s", "remaining_time": "10h 53m 28s", "loss_scale": 1.0, "consumed_samples": 98816, "global_step/max_steps": "386/6362"} +{"lm loss": 5.53604412, "grad_norm": 2.24163961, "learning_rate": 9.997e-05, "elapsed_time_per_iteration": 6.46551561, "memory(GiB)": 21.49, "elapsed_time": "42m 19s", "remaining_time": "10h 53m 20s", "loss_scale": 1.0, "consumed_samples": 99072, "global_step/max_steps": "387/6362"} +{"lm loss": 5.51573086, "grad_norm": 1.42039955, "learning_rate": 9.997e-05, "elapsed_time_per_iteration": 6.58318686, "memory(GiB)": 21.49, "elapsed_time": "42m 25s", "remaining_time": "10h 53m 14s", "loss_scale": 1.0, "consumed_samples": 99328, "global_step/max_steps": "388/6362"} +{"lm loss": 5.53184414, "grad_norm": 2.32888436, "learning_rate": 9.997e-05, "elapsed_time_per_iteration": 6.45189738, "memory(GiB)": 21.49, "elapsed_time": "42m 32s", "remaining_time": "10h 53m 5s", "loss_scale": 1.0, "consumed_samples": 99584, "global_step/max_steps": "389/6362"} +{"lm loss": 5.51758766, "grad_norm": 1.76547527, "learning_rate": 9.997e-05, "elapsed_time_per_iteration": 6.46469593, "memory(GiB)": 21.49, "elapsed_time": "42m 38s", "remaining_time": "10h 52m 57s", "loss_scale": 1.0, "consumed_samples": 99840, "global_step/max_steps": "390/6362"} +{"lm loss": 5.50901699, "grad_norm": 1.7911768, "learning_rate": 9.997e-05, "elapsed_time_per_iteration": 6.72817135, "memory(GiB)": 21.49, "elapsed_time": "42m 45s", "remaining_time": "10h 52m 53s", "loss_scale": 1.0, "consumed_samples": 100096, "global_step/max_steps": "391/6362"} +{"lm loss": 5.51308298, "grad_norm": 1.37316573, "learning_rate": 9.996e-05, "elapsed_time_per_iteration": 6.55210662, "memory(GiB)": 21.49, "elapsed_time": "42m 51s", "remaining_time": "10h 52m 47s", "loss_scale": 1.0, "consumed_samples": 100352, "global_step/max_steps": "392/6362"} +{"lm loss": 5.52286053, "grad_norm": 2.31983137, "learning_rate": 9.996e-05, "elapsed_time_per_iteration": 6.6217196, "memory(GiB)": 21.49, "elapsed_time": "42m 58s", "remaining_time": "10h 52m 41s", "loss_scale": 1.0, "consumed_samples": 100608, "global_step/max_steps": "393/6362"} +{"lm loss": 5.51743746, "grad_norm": 1.376665, "learning_rate": 9.996e-05, "elapsed_time_per_iteration": 6.4175384, "memory(GiB)": 21.49, "elapsed_time": "43m 4s", "remaining_time": "10h 52m 32s", "loss_scale": 1.0, "consumed_samples": 100864, "global_step/max_steps": "394/6362"} +{"lm loss": 5.49106789, "grad_norm": 1.97162104, "learning_rate": 9.996e-05, "elapsed_time_per_iteration": 6.80474925, "memory(GiB)": 21.49, "elapsed_time": "43m 11s", "remaining_time": "10h 52m 29s", "loss_scale": 1.0, "consumed_samples": 101120, "global_step/max_steps": "395/6362"} +{"lm loss": 5.51726723, "grad_norm": 1.43240762, "learning_rate": 9.996e-05, "elapsed_time_per_iteration": 6.62457252, "memory(GiB)": 21.49, "elapsed_time": "43m 18s", "remaining_time": "10h 52m 24s", "loss_scale": 1.0, "consumed_samples": 101376, "global_step/max_steps": "396/6362"} +{"lm loss": 5.50386763, "grad_norm": 1.61791515, "learning_rate": 9.996e-05, "elapsed_time_per_iteration": 6.6038506, "memory(GiB)": 21.49, "elapsed_time": "43m 24s", "remaining_time": "10h 52m 18s", "loss_scale": 1.0, "consumed_samples": 101632, "global_step/max_steps": "397/6362"} +{"lm loss": 5.50050735, "grad_norm": 1.95597756, "learning_rate": 9.996e-05, "elapsed_time_per_iteration": 6.61729026, "memory(GiB)": 21.49, "elapsed_time": "43m 31s", "remaining_time": "10h 52m 12s", "loss_scale": 1.0, "consumed_samples": 101888, "global_step/max_steps": "398/6362"} +{"lm loss": 5.5088563, "grad_norm": 1.42861891, "learning_rate": 9.996e-05, "elapsed_time_per_iteration": 6.68164062, "memory(GiB)": 21.49, "elapsed_time": "43m 38s", "remaining_time": "10h 52m 7s", "loss_scale": 1.0, "consumed_samples": 102144, "global_step/max_steps": "399/6362"} +{"lm loss": 5.51632166, "grad_norm": 1.55472171, "learning_rate": 9.996e-05, "elapsed_time_per_iteration": 6.77448249, "memory(GiB)": 21.49, "elapsed_time": "43m 44s", "remaining_time": "10h 52m 4s", "loss_scale": 1.0, "consumed_samples": 102400, "global_step/max_steps": "400/6362"} +{"lm loss": 5.51963234, "grad_norm": 1.59097087, "learning_rate": 9.995e-05, "elapsed_time_per_iteration": 6.49645019, "memory(GiB)": 21.49, "elapsed_time": "43m 51s", "remaining_time": "10h 51m 57s", "loss_scale": 1.0, "consumed_samples": 102656, "global_step/max_steps": "401/6362"} +{"lm loss": 5.51418781, "grad_norm": 1.68827784, "learning_rate": 9.995e-05, "elapsed_time_per_iteration": 6.5216918, "memory(GiB)": 21.49, "elapsed_time": "43m 57s", "remaining_time": "10h 51m 49s", "loss_scale": 1.0, "consumed_samples": 102912, "global_step/max_steps": "402/6362"} +{"lm loss": 5.49836969, "grad_norm": 1.87693429, "learning_rate": 9.995e-05, "elapsed_time_per_iteration": 6.76107121, "memory(GiB)": 21.49, "elapsed_time": "44m 4s", "remaining_time": "10h 51m 46s", "loss_scale": 1.0, "consumed_samples": 103168, "global_step/max_steps": "403/6362"} +{"lm loss": 5.51061773, "grad_norm": 1.96697271, "learning_rate": 9.995e-05, "elapsed_time_per_iteration": 6.75012922, "memory(GiB)": 21.49, "elapsed_time": "44m 11s", "remaining_time": "10h 51m 42s", "loss_scale": 1.0, "consumed_samples": 103424, "global_step/max_steps": "404/6362"} +{"lm loss": 5.49702454, "grad_norm": 1.70727897, "learning_rate": 9.995e-05, "elapsed_time_per_iteration": 6.49869108, "memory(GiB)": 21.49, "elapsed_time": "44m 17s", "remaining_time": "10h 51m 34s", "loss_scale": 1.0, "consumed_samples": 103680, "global_step/max_steps": "405/6362"} +{"lm loss": 5.49927664, "grad_norm": 1.18566489, "learning_rate": 9.995e-05, "elapsed_time_per_iteration": 6.73545742, "memory(GiB)": 21.49, "elapsed_time": "44m 24s", "remaining_time": "10h 51m 30s", "loss_scale": 1.0, "consumed_samples": 103936, "global_step/max_steps": "406/6362"} +{"lm loss": 5.48061323, "grad_norm": 1.71959639, "learning_rate": 9.995e-05, "elapsed_time_per_iteration": 6.65059042, "memory(GiB)": 21.49, "elapsed_time": "44m 31s", "remaining_time": "10h 51m 25s", "loss_scale": 1.0, "consumed_samples": 104192, "global_step/max_steps": "407/6362"} +{"lm loss": 5.47276926, "grad_norm": 1.39931798, "learning_rate": 9.995e-05, "elapsed_time_per_iteration": 6.54369569, "memory(GiB)": 21.49, "elapsed_time": "44m 37s", "remaining_time": "10h 51m 18s", "loss_scale": 1.0, "consumed_samples": 104448, "global_step/max_steps": "408/6362"} +{"lm loss": 5.48749924, "grad_norm": 1.56331825, "learning_rate": 9.995e-05, "elapsed_time_per_iteration": 6.58504534, "memory(GiB)": 21.49, "elapsed_time": "44m 44s", "remaining_time": "10h 51m 12s", "loss_scale": 1.0, "consumed_samples": 104704, "global_step/max_steps": "409/6362"} +{"lm loss": 5.49761295, "grad_norm": 1.60844135, "learning_rate": 9.994e-05, "elapsed_time_per_iteration": 6.57128739, "memory(GiB)": 21.49, "elapsed_time": "44m 51s", "remaining_time": "10h 51m 6s", "loss_scale": 1.0, "consumed_samples": 104960, "global_step/max_steps": "410/6362"} +{"lm loss": 5.49657249, "grad_norm": 1.58402097, "learning_rate": 9.994e-05, "elapsed_time_per_iteration": 6.66907144, "memory(GiB)": 21.49, "elapsed_time": "44m 57s", "remaining_time": "10h 51m 1s", "loss_scale": 1.0, "consumed_samples": 105216, "global_step/max_steps": "411/6362"} +{"lm loss": 5.50208902, "grad_norm": 1.7249552, "learning_rate": 9.994e-05, "elapsed_time_per_iteration": 6.51552629, "memory(GiB)": 21.49, "elapsed_time": "45m 4s", "remaining_time": "10h 50m 53s", "loss_scale": 1.0, "consumed_samples": 105472, "global_step/max_steps": "412/6362"} +{"lm loss": 5.48815536, "grad_norm": 1.73268902, "learning_rate": 9.994e-05, "elapsed_time_per_iteration": 6.43161559, "memory(GiB)": 21.49, "elapsed_time": "45m 10s", "remaining_time": "10h 50m 45s", "loss_scale": 1.0, "consumed_samples": 105728, "global_step/max_steps": "413/6362"} +{"lm loss": 5.48803663, "grad_norm": 1.76079321, "learning_rate": 9.994e-05, "elapsed_time_per_iteration": 6.6383779, "memory(GiB)": 21.49, "elapsed_time": "45m 17s", "remaining_time": "10h 50m 39s", "loss_scale": 1.0, "consumed_samples": 105984, "global_step/max_steps": "414/6362"} +{"lm loss": 5.46983528, "grad_norm": 1.70965981, "learning_rate": 9.994e-05, "elapsed_time_per_iteration": 6.6473124, "memory(GiB)": 21.49, "elapsed_time": "45m 23s", "remaining_time": "10h 50m 34s", "loss_scale": 1.0, "consumed_samples": 106240, "global_step/max_steps": "415/6362"} +{"lm loss": 5.48259354, "grad_norm": 1.62127125, "learning_rate": 9.994e-05, "elapsed_time_per_iteration": 6.71497226, "memory(GiB)": 21.49, "elapsed_time": "45m 30s", "remaining_time": "10h 50m 30s", "loss_scale": 1.0, "consumed_samples": 106496, "global_step/max_steps": "416/6362"} +{"lm loss": 5.49941206, "grad_norm": 1.96917427, "learning_rate": 9.994e-05, "elapsed_time_per_iteration": 6.42244601, "memory(GiB)": 21.49, "elapsed_time": "45m 37s", "remaining_time": "10h 50m 21s", "loss_scale": 1.0, "consumed_samples": 106752, "global_step/max_steps": "417/6362"} +{"lm loss": 5.49232292, "grad_norm": 1.54277039, "learning_rate": 9.993e-05, "elapsed_time_per_iteration": 6.63970876, "memory(GiB)": 21.49, "elapsed_time": "45m 43s", "remaining_time": "10h 50m 15s", "loss_scale": 1.0, "consumed_samples": 107008, "global_step/max_steps": "418/6362"} +{"lm loss": 5.50413513, "grad_norm": 1.76740146, "learning_rate": 9.993e-05, "elapsed_time_per_iteration": 6.80118346, "memory(GiB)": 21.49, "elapsed_time": "45m 50s", "remaining_time": "10h 50m 12s", "loss_scale": 1.0, "consumed_samples": 107264, "global_step/max_steps": "419/6362"} +{"lm loss": 5.47100782, "grad_norm": 1.75138164, "learning_rate": 9.993e-05, "elapsed_time_per_iteration": 6.50505066, "memory(GiB)": 21.49, "elapsed_time": "45m 57s", "remaining_time": "10h 50m 5s", "loss_scale": 1.0, "consumed_samples": 107520, "global_step/max_steps": "420/6362"} +{"lm loss": 5.47570467, "grad_norm": 1.820436, "learning_rate": 9.993e-05, "elapsed_time_per_iteration": 6.601578, "memory(GiB)": 21.49, "elapsed_time": "46m 3s", "remaining_time": "10h 49m 59s", "loss_scale": 1.0, "consumed_samples": 107776, "global_step/max_steps": "421/6362"} +{"lm loss": 5.48539877, "grad_norm": 1.44651997, "learning_rate": 9.993e-05, "elapsed_time_per_iteration": 6.60260773, "memory(GiB)": 21.49, "elapsed_time": "46m 10s", "remaining_time": "10h 49m 53s", "loss_scale": 1.0, "consumed_samples": 108032, "global_step/max_steps": "422/6362"} +{"lm loss": 5.46373367, "grad_norm": 1.81060469, "learning_rate": 9.993e-05, "elapsed_time_per_iteration": 6.76592803, "memory(GiB)": 21.49, "elapsed_time": "46m 16s", "remaining_time": "10h 49m 49s", "loss_scale": 1.0, "consumed_samples": 108288, "global_step/max_steps": "423/6362"} +{"lm loss": 5.48475122, "grad_norm": 1.37724984, "learning_rate": 9.993e-05, "elapsed_time_per_iteration": 6.68177748, "memory(GiB)": 21.49, "elapsed_time": "46m 23s", "remaining_time": "10h 49m 44s", "loss_scale": 1.0, "consumed_samples": 108544, "global_step/max_steps": "424/6362"} +{"lm loss": 5.46834898, "grad_norm": 1.89009905, "learning_rate": 9.993e-05, "elapsed_time_per_iteration": 6.4977386, "memory(GiB)": 21.49, "elapsed_time": "46m 30s", "remaining_time": "10h 49m 37s", "loss_scale": 1.0, "consumed_samples": 108800, "global_step/max_steps": "425/6362"} +{"lm loss": 5.47721577, "grad_norm": 1.49433339, "learning_rate": 9.992e-05, "elapsed_time_per_iteration": 6.70982885, "memory(GiB)": 21.49, "elapsed_time": "46m 36s", "remaining_time": "10h 49m 32s", "loss_scale": 1.0, "consumed_samples": 109056, "global_step/max_steps": "426/6362"} +{"lm loss": 5.48452473, "grad_norm": 1.65973485, "learning_rate": 9.992e-05, "elapsed_time_per_iteration": 6.47670889, "memory(GiB)": 21.49, "elapsed_time": "46m 43s", "remaining_time": "10h 49m 24s", "loss_scale": 1.0, "consumed_samples": 109312, "global_step/max_steps": "427/6362"} +{"lm loss": 5.48596907, "grad_norm": 1.76461375, "learning_rate": 9.992e-05, "elapsed_time_per_iteration": 6.62908649, "memory(GiB)": 21.49, "elapsed_time": "46m 49s", "remaining_time": "10h 49m 19s", "loss_scale": 1.0, "consumed_samples": 109568, "global_step/max_steps": "428/6362"} +{"lm loss": 5.47139978, "grad_norm": 1.59344161, "learning_rate": 9.992e-05, "elapsed_time_per_iteration": 6.50700092, "memory(GiB)": 21.49, "elapsed_time": "46m 56s", "remaining_time": "10h 49m 11s", "loss_scale": 1.0, "consumed_samples": 109824, "global_step/max_steps": "429/6362"} +{"lm loss": 5.46788454, "grad_norm": 1.43380606, "learning_rate": 9.992e-05, "elapsed_time_per_iteration": 6.78045917, "memory(GiB)": 21.49, "elapsed_time": "47m 3s", "remaining_time": "10h 49m 8s", "loss_scale": 1.0, "consumed_samples": 110080, "global_step/max_steps": "430/6362"} +{"lm loss": 5.49473476, "grad_norm": 1.92955792, "learning_rate": 9.992e-05, "elapsed_time_per_iteration": 6.50799274, "memory(GiB)": 21.49, "elapsed_time": "47m 9s", "remaining_time": "10h 49m 0s", "loss_scale": 1.0, "consumed_samples": 110336, "global_step/max_steps": "431/6362"} +{"lm loss": 5.47913027, "grad_norm": 1.82343519, "learning_rate": 9.992e-05, "elapsed_time_per_iteration": 6.61070085, "memory(GiB)": 21.49, "elapsed_time": "47m 16s", "remaining_time": "10h 48m 54s", "loss_scale": 1.0, "consumed_samples": 110592, "global_step/max_steps": "432/6362"} +{"lm loss": 5.45553398, "grad_norm": 1.79076302, "learning_rate": 9.991e-05, "elapsed_time_per_iteration": 6.68552542, "memory(GiB)": 21.49, "elapsed_time": "47m 23s", "remaining_time": "10h 48m 49s", "loss_scale": 1.0, "consumed_samples": 110848, "global_step/max_steps": "433/6362"} +{"lm loss": 5.45301628, "grad_norm": 1.30756867, "learning_rate": 9.991e-05, "elapsed_time_per_iteration": 6.43409443, "memory(GiB)": 21.49, "elapsed_time": "47m 29s", "remaining_time": "10h 48m 41s", "loss_scale": 1.0, "consumed_samples": 111104, "global_step/max_steps": "434/6362"} +{"lm loss": 5.48752785, "grad_norm": 1.61550415, "learning_rate": 9.991e-05, "elapsed_time_per_iteration": 6.54710269, "memory(GiB)": 21.49, "elapsed_time": "47m 36s", "remaining_time": "10h 48m 34s", "loss_scale": 1.0, "consumed_samples": 111360, "global_step/max_steps": "435/6362"} +{"lm loss": 5.46222448, "grad_norm": 1.35725856, "learning_rate": 9.991e-05, "elapsed_time_per_iteration": 6.55085921, "memory(GiB)": 21.49, "elapsed_time": "47m 42s", "remaining_time": "10h 48m 27s", "loss_scale": 1.0, "consumed_samples": 111616, "global_step/max_steps": "436/6362"} +{"lm loss": 5.48521233, "grad_norm": 1.92870307, "learning_rate": 9.991e-05, "elapsed_time_per_iteration": 6.39544392, "memory(GiB)": 21.49, "elapsed_time": "47m 49s", "remaining_time": "10h 48m 19s", "loss_scale": 1.0, "consumed_samples": 111872, "global_step/max_steps": "437/6362"} +{"lm loss": 5.45435762, "grad_norm": 1.17062974, "learning_rate": 9.991e-05, "elapsed_time_per_iteration": 6.50671577, "memory(GiB)": 21.49, "elapsed_time": "47m 55s", "remaining_time": "10h 48m 11s", "loss_scale": 1.0, "consumed_samples": 112128, "global_step/max_steps": "438/6362"} +{"lm loss": 5.4728384, "grad_norm": 1.73429549, "learning_rate": 9.99e-05, "elapsed_time_per_iteration": 6.59114075, "memory(GiB)": 21.49, "elapsed_time": "48m 2s", "remaining_time": "10h 48m 5s", "loss_scale": 1.0, "consumed_samples": 112384, "global_step/max_steps": "439/6362"} +{"lm loss": 5.44770718, "grad_norm": 1.73384666, "learning_rate": 9.99e-05, "elapsed_time_per_iteration": 6.63170719, "memory(GiB)": 21.49, "elapsed_time": "48m 8s", "remaining_time": "10h 47m 59s", "loss_scale": 1.0, "consumed_samples": 112640, "global_step/max_steps": "440/6362"} +{"lm loss": 5.48213625, "grad_norm": 1.45515108, "learning_rate": 9.99e-05, "elapsed_time_per_iteration": 6.42576241, "memory(GiB)": 21.49, "elapsed_time": "48m 15s", "remaining_time": "10h 47m 51s", "loss_scale": 1.0, "consumed_samples": 112896, "global_step/max_steps": "441/6362"} +{"lm loss": 5.47214937, "grad_norm": 1.52661884, "learning_rate": 9.99e-05, "elapsed_time_per_iteration": 6.59470534, "memory(GiB)": 21.49, "elapsed_time": "48m 21s", "remaining_time": "10h 47m 45s", "loss_scale": 1.0, "consumed_samples": 113152, "global_step/max_steps": "442/6362"} +{"lm loss": 5.46562624, "grad_norm": 1.56233823, "learning_rate": 9.99e-05, "elapsed_time_per_iteration": 6.44207764, "memory(GiB)": 21.49, "elapsed_time": "48m 28s", "remaining_time": "10h 47m 37s", "loss_scale": 1.0, "consumed_samples": 113408, "global_step/max_steps": "443/6362"} +{"lm loss": 5.44276142, "grad_norm": 1.90496027, "learning_rate": 9.99e-05, "elapsed_time_per_iteration": 6.65520954, "memory(GiB)": 21.49, "elapsed_time": "48m 34s", "remaining_time": "10h 47m 31s", "loss_scale": 1.0, "consumed_samples": 113664, "global_step/max_steps": "444/6362"} +{"lm loss": 5.45588064, "grad_norm": 1.46172023, "learning_rate": 9.989e-05, "elapsed_time_per_iteration": 6.54658031, "memory(GiB)": 21.49, "elapsed_time": "48m 41s", "remaining_time": "10h 47m 24s", "loss_scale": 1.0, "consumed_samples": 113920, "global_step/max_steps": "445/6362"} +{"lm loss": 5.46061993, "grad_norm": 1.98468709, "learning_rate": 9.989e-05, "elapsed_time_per_iteration": 6.56056499, "memory(GiB)": 21.49, "elapsed_time": "48m 47s", "remaining_time": "10h 47m 18s", "loss_scale": 1.0, "consumed_samples": 114176, "global_step/max_steps": "446/6362"} +{"lm loss": 5.44684124, "grad_norm": 1.27254856, "learning_rate": 9.989e-05, "elapsed_time_per_iteration": 6.60058069, "memory(GiB)": 21.49, "elapsed_time": "48m 54s", "remaining_time": "10h 47m 12s", "loss_scale": 1.0, "consumed_samples": 114432, "global_step/max_steps": "447/6362"} +{"lm loss": 5.44050169, "grad_norm": 1.68414414, "learning_rate": 9.989e-05, "elapsed_time_per_iteration": 6.68632269, "memory(GiB)": 21.49, "elapsed_time": "49m 1s", "remaining_time": "10h 47m 7s", "loss_scale": 1.0, "consumed_samples": 114688, "global_step/max_steps": "448/6362"} +{"lm loss": 5.46648693, "grad_norm": 1.53955293, "learning_rate": 9.989e-05, "elapsed_time_per_iteration": 6.51093078, "memory(GiB)": 21.49, "elapsed_time": "49m 7s", "remaining_time": "10h 46m 59s", "loss_scale": 1.0, "consumed_samples": 114944, "global_step/max_steps": "449/6362"} +{"lm loss": 5.47114325, "grad_norm": 1.46879327, "learning_rate": 9.989e-05, "elapsed_time_per_iteration": 6.49754524, "memory(GiB)": 21.49, "elapsed_time": "49m 14s", "remaining_time": "10h 46m 52s", "loss_scale": 1.0, "consumed_samples": 115200, "global_step/max_steps": "450/6362"} +{"lm loss": 5.45222569, "grad_norm": 1.4147315, "learning_rate": 9.988e-05, "elapsed_time_per_iteration": 6.44826198, "memory(GiB)": 21.49, "elapsed_time": "49m 20s", "remaining_time": "10h 46m 44s", "loss_scale": 1.0, "consumed_samples": 115456, "global_step/max_steps": "451/6362"} +{"lm loss": 5.46208668, "grad_norm": 1.79118073, "learning_rate": 9.988e-05, "elapsed_time_per_iteration": 6.75425577, "memory(GiB)": 21.49, "elapsed_time": "49m 27s", "remaining_time": "10h 46m 40s", "loss_scale": 1.0, "consumed_samples": 115712, "global_step/max_steps": "452/6362"} +{"lm loss": 5.45399427, "grad_norm": 1.46198952, "learning_rate": 9.988e-05, "elapsed_time_per_iteration": 6.67883778, "memory(GiB)": 21.49, "elapsed_time": "49m 34s", "remaining_time": "10h 46m 35s", "loss_scale": 1.0, "consumed_samples": 115968, "global_step/max_steps": "453/6362"} +{"lm loss": 5.45424795, "grad_norm": 1.70592642, "learning_rate": 9.988e-05, "elapsed_time_per_iteration": 6.39887309, "memory(GiB)": 21.49, "elapsed_time": "49m 40s", "remaining_time": "10h 46m 26s", "loss_scale": 1.0, "consumed_samples": 116224, "global_step/max_steps": "454/6362"} +{"lm loss": 5.46454334, "grad_norm": 1.62259531, "learning_rate": 9.988e-05, "elapsed_time_per_iteration": 6.55047297, "memory(GiB)": 21.49, "elapsed_time": "49m 47s", "remaining_time": "10h 46m 19s", "loss_scale": 1.0, "consumed_samples": 116480, "global_step/max_steps": "455/6362"} +{"lm loss": 5.44246483, "grad_norm": 1.58306229, "learning_rate": 9.988e-05, "elapsed_time_per_iteration": 6.60689378, "memory(GiB)": 21.49, "elapsed_time": "49m 53s", "remaining_time": "10h 46m 13s", "loss_scale": 1.0, "consumed_samples": 116736, "global_step/max_steps": "456/6362"} +{"lm loss": 5.44198227, "grad_norm": 1.44440198, "learning_rate": 9.987e-05, "elapsed_time_per_iteration": 6.33391094, "memory(GiB)": 21.49, "elapsed_time": "50m 0s", "remaining_time": "10h 46m 4s", "loss_scale": 1.0, "consumed_samples": 116992, "global_step/max_steps": "457/6362"} +{"lm loss": 5.46885681, "grad_norm": 1.82683933, "learning_rate": 9.987e-05, "elapsed_time_per_iteration": 6.68757677, "memory(GiB)": 21.49, "elapsed_time": "50m 6s", "remaining_time": "10h 45m 59s", "loss_scale": 1.0, "consumed_samples": 117248, "global_step/max_steps": "458/6362"} +{"lm loss": 5.45155382, "grad_norm": 1.220631, "learning_rate": 9.987e-05, "elapsed_time_per_iteration": 6.51256537, "memory(GiB)": 21.49, "elapsed_time": "50m 13s", "remaining_time": "10h 45m 51s", "loss_scale": 1.0, "consumed_samples": 117504, "global_step/max_steps": "459/6362"} +{"lm loss": 5.430161, "grad_norm": 2.00340986, "learning_rate": 9.987e-05, "elapsed_time_per_iteration": 6.72548652, "memory(GiB)": 21.49, "elapsed_time": "50m 19s", "remaining_time": "10h 45m 47s", "loss_scale": 1.0, "consumed_samples": 117760, "global_step/max_steps": "460/6362"} +{"lm loss": 5.45637941, "grad_norm": 1.20502508, "learning_rate": 9.987e-05, "elapsed_time_per_iteration": 6.61852574, "memory(GiB)": 21.49, "elapsed_time": "50m 26s", "remaining_time": "10h 45m 41s", "loss_scale": 1.0, "consumed_samples": 118016, "global_step/max_steps": "461/6362"} +{"lm loss": 5.45836687, "grad_norm": 1.61453056, "learning_rate": 9.986e-05, "elapsed_time_per_iteration": 6.73683238, "memory(GiB)": 21.49, "elapsed_time": "50m 33s", "remaining_time": "10h 45m 37s", "loss_scale": 1.0, "consumed_samples": 118272, "global_step/max_steps": "462/6362"} +{"lm loss": 5.44620562, "grad_norm": 1.45803678, "learning_rate": 9.986e-05, "elapsed_time_per_iteration": 6.64176774, "memory(GiB)": 21.49, "elapsed_time": "50m 39s", "remaining_time": "10h 45m 31s", "loss_scale": 1.0, "consumed_samples": 118528, "global_step/max_steps": "463/6362"} +{"lm loss": 5.43459797, "grad_norm": 1.63861132, "learning_rate": 9.986e-05, "elapsed_time_per_iteration": 6.64540195, "memory(GiB)": 21.49, "elapsed_time": "50m 46s", "remaining_time": "10h 45m 25s", "loss_scale": 1.0, "consumed_samples": 118784, "global_step/max_steps": "464/6362"} +{"lm loss": 5.44262838, "grad_norm": 1.76545489, "learning_rate": 9.986e-05, "elapsed_time_per_iteration": 6.64977264, "memory(GiB)": 21.49, "elapsed_time": "50m 53s", "remaining_time": "10h 45m 20s", "loss_scale": 1.0, "consumed_samples": 119040, "global_step/max_steps": "465/6362"} +{"lm loss": 5.4166832, "grad_norm": 1.40900552, "learning_rate": 9.986e-05, "elapsed_time_per_iteration": 6.48122358, "memory(GiB)": 21.49, "elapsed_time": "50m 59s", "remaining_time": "10h 45m 12s", "loss_scale": 1.0, "consumed_samples": 119296, "global_step/max_steps": "466/6362"} +{"lm loss": 5.43384457, "grad_norm": 1.5264411, "learning_rate": 9.985e-05, "elapsed_time_per_iteration": 6.40366101, "memory(GiB)": 21.49, "elapsed_time": "51m 6s", "remaining_time": "10h 45m 4s", "loss_scale": 1.0, "consumed_samples": 119552, "global_step/max_steps": "467/6362"} +{"lm loss": 5.43699026, "grad_norm": 1.74966657, "learning_rate": 9.985e-05, "elapsed_time_per_iteration": 6.42090225, "memory(GiB)": 21.49, "elapsed_time": "51m 12s", "remaining_time": "10h 44m 55s", "loss_scale": 1.0, "consumed_samples": 119808, "global_step/max_steps": "468/6362"} +{"lm loss": 5.44686985, "grad_norm": 1.48786545, "learning_rate": 9.985e-05, "elapsed_time_per_iteration": 6.42073178, "memory(GiB)": 21.49, "elapsed_time": "51m 18s", "remaining_time": "10h 44m 47s", "loss_scale": 1.0, "consumed_samples": 120064, "global_step/max_steps": "469/6362"} +{"lm loss": 5.43579531, "grad_norm": 1.78617406, "learning_rate": 9.985e-05, "elapsed_time_per_iteration": 6.44034481, "memory(GiB)": 21.49, "elapsed_time": "51m 25s", "remaining_time": "10h 44m 39s", "loss_scale": 1.0, "consumed_samples": 120320, "global_step/max_steps": "470/6362"} +{"lm loss": 5.45895958, "grad_norm": 1.13185239, "learning_rate": 9.985e-05, "elapsed_time_per_iteration": 6.90395212, "memory(GiB)": 21.49, "elapsed_time": "51m 32s", "remaining_time": "10h 44m 37s", "loss_scale": 1.0, "consumed_samples": 120576, "global_step/max_steps": "471/6362"} +{"lm loss": 5.4514842, "grad_norm": 2.03702617, "learning_rate": 9.984e-05, "elapsed_time_per_iteration": 6.70969105, "memory(GiB)": 21.49, "elapsed_time": "51m 39s", "remaining_time": "10h 44m 32s", "loss_scale": 1.0, "consumed_samples": 120832, "global_step/max_steps": "472/6362"} +{"lm loss": 5.41615772, "grad_norm": 1.23307037, "learning_rate": 9.984e-05, "elapsed_time_per_iteration": 6.60784435, "memory(GiB)": 21.49, "elapsed_time": "51m 45s", "remaining_time": "10h 44m 26s", "loss_scale": 1.0, "consumed_samples": 121088, "global_step/max_steps": "473/6362"} +{"lm loss": 5.42427492, "grad_norm": 1.48575377, "learning_rate": 9.984e-05, "elapsed_time_per_iteration": 6.6007905, "memory(GiB)": 21.49, "elapsed_time": "51m 52s", "remaining_time": "10h 44m 20s", "loss_scale": 1.0, "consumed_samples": 121344, "global_step/max_steps": "474/6362"} +{"lm loss": 5.44088745, "grad_norm": 1.5646944, "learning_rate": 9.984e-05, "elapsed_time_per_iteration": 6.64202332, "memory(GiB)": 21.49, "elapsed_time": "51m 58s", "remaining_time": "10h 44m 14s", "loss_scale": 1.0, "consumed_samples": 121600, "global_step/max_steps": "475/6362"} +{"lm loss": 5.41867638, "grad_norm": 1.44290054, "learning_rate": 9.984e-05, "elapsed_time_per_iteration": 6.4876194, "memory(GiB)": 21.49, "elapsed_time": "52m 5s", "remaining_time": "10h 44m 6s", "loss_scale": 1.0, "consumed_samples": 121856, "global_step/max_steps": "476/6362"} +{"lm loss": 5.42140198, "grad_norm": 1.63182831, "learning_rate": 9.983e-05, "elapsed_time_per_iteration": 6.67657781, "memory(GiB)": 21.49, "elapsed_time": "52m 12s", "remaining_time": "10h 44m 1s", "loss_scale": 1.0, "consumed_samples": 122112, "global_step/max_steps": "477/6362"} +{"lm loss": 5.43727732, "grad_norm": 1.42147028, "learning_rate": 9.983e-05, "elapsed_time_per_iteration": 6.44656682, "memory(GiB)": 21.49, "elapsed_time": "52m 18s", "remaining_time": "10h 43m 53s", "loss_scale": 1.0, "consumed_samples": 122368, "global_step/max_steps": "478/6362"} +{"lm loss": 5.4375391, "grad_norm": 1.28623736, "learning_rate": 9.983e-05, "elapsed_time_per_iteration": 6.63078332, "memory(GiB)": 21.49, "elapsed_time": "52m 25s", "remaining_time": "10h 43m 47s", "loss_scale": 1.0, "consumed_samples": 122624, "global_step/max_steps": "479/6362"} +{"lm loss": 5.43738794, "grad_norm": 1.46134531, "learning_rate": 9.983e-05, "elapsed_time_per_iteration": 6.69102502, "memory(GiB)": 21.49, "elapsed_time": "52m 31s", "remaining_time": "10h 43m 42s", "loss_scale": 1.0, "consumed_samples": 122880, "global_step/max_steps": "480/6362"} +{"lm loss": 5.4673996, "grad_norm": 1.42742264, "learning_rate": 9.983e-05, "elapsed_time_per_iteration": 6.60385585, "memory(GiB)": 21.49, "elapsed_time": "52m 38s", "remaining_time": "10h 43m 36s", "loss_scale": 1.0, "consumed_samples": 123136, "global_step/max_steps": "481/6362"} +{"lm loss": 5.43920994, "grad_norm": 1.57131147, "learning_rate": 9.982e-05, "elapsed_time_per_iteration": 6.34185076, "memory(GiB)": 21.49, "elapsed_time": "52m 44s", "remaining_time": "10h 43m 27s", "loss_scale": 1.0, "consumed_samples": 123392, "global_step/max_steps": "482/6362"} +{"lm loss": 5.42473984, "grad_norm": 1.67896473, "learning_rate": 9.982e-05, "elapsed_time_per_iteration": 6.54006195, "memory(GiB)": 21.49, "elapsed_time": "52m 51s", "remaining_time": "10h 43m 20s", "loss_scale": 1.0, "consumed_samples": 123648, "global_step/max_steps": "483/6362"} +{"lm loss": 5.4292469, "grad_norm": 1.36602604, "learning_rate": 9.982e-05, "elapsed_time_per_iteration": 6.64597082, "memory(GiB)": 21.49, "elapsed_time": "52m 57s", "remaining_time": "10h 43m 14s", "loss_scale": 1.0, "consumed_samples": 123904, "global_step/max_steps": "484/6362"} +{"lm loss": 5.43121195, "grad_norm": 1.51818204, "learning_rate": 9.982e-05, "elapsed_time_per_iteration": 6.62076306, "memory(GiB)": 21.49, "elapsed_time": "53m 4s", "remaining_time": "10h 43m 9s", "loss_scale": 1.0, "consumed_samples": 124160, "global_step/max_steps": "485/6362"} +{"lm loss": 5.4379406, "grad_norm": 1.15250444, "learning_rate": 9.982e-05, "elapsed_time_per_iteration": 6.57009315, "memory(GiB)": 21.49, "elapsed_time": "53m 11s", "remaining_time": "10h 43m 2s", "loss_scale": 1.0, "consumed_samples": 124416, "global_step/max_steps": "486/6362"} +{"lm loss": 5.43233013, "grad_norm": 1.74116325, "learning_rate": 9.981e-05, "elapsed_time_per_iteration": 6.63754082, "memory(GiB)": 21.49, "elapsed_time": "53m 17s", "remaining_time": "10h 42m 56s", "loss_scale": 1.0, "consumed_samples": 124672, "global_step/max_steps": "487/6362"} +{"lm loss": 5.43526077, "grad_norm": 1.59746361, "learning_rate": 9.981e-05, "elapsed_time_per_iteration": 6.79174662, "memory(GiB)": 21.49, "elapsed_time": "53m 24s", "remaining_time": "10h 42m 53s", "loss_scale": 1.0, "consumed_samples": 124928, "global_step/max_steps": "488/6362"} +{"lm loss": 5.41233683, "grad_norm": 1.73415709, "learning_rate": 9.981e-05, "elapsed_time_per_iteration": 6.77449369, "memory(GiB)": 21.49, "elapsed_time": "53m 31s", "remaining_time": "10h 42m 48s", "loss_scale": 1.0, "consumed_samples": 125184, "global_step/max_steps": "489/6362"} +{"lm loss": 5.4207859, "grad_norm": 1.52151752, "learning_rate": 9.981e-05, "elapsed_time_per_iteration": 6.52877426, "memory(GiB)": 21.49, "elapsed_time": "53m 37s", "remaining_time": "10h 42m 41s", "loss_scale": 1.0, "consumed_samples": 125440, "global_step/max_steps": "490/6362"} +{"lm loss": 5.40115976, "grad_norm": 1.46936333, "learning_rate": 9.98e-05, "elapsed_time_per_iteration": 6.6352663, "memory(GiB)": 21.49, "elapsed_time": "53m 44s", "remaining_time": "10h 42m 36s", "loss_scale": 1.0, "consumed_samples": 125696, "global_step/max_steps": "491/6362"} +{"lm loss": 5.42895937, "grad_norm": 1.30826414, "learning_rate": 9.98e-05, "elapsed_time_per_iteration": 6.67373729, "memory(GiB)": 21.49, "elapsed_time": "53m 51s", "remaining_time": "10h 42m 30s", "loss_scale": 1.0, "consumed_samples": 125952, "global_step/max_steps": "492/6362"} +{"lm loss": 5.41676092, "grad_norm": 1.46152687, "learning_rate": 9.98e-05, "elapsed_time_per_iteration": 6.62094617, "memory(GiB)": 21.49, "elapsed_time": "53m 57s", "remaining_time": "10h 42m 24s", "loss_scale": 1.0, "consumed_samples": 126208, "global_step/max_steps": "493/6362"} +{"lm loss": 5.42993975, "grad_norm": 1.53315663, "learning_rate": 9.98e-05, "elapsed_time_per_iteration": 6.71221447, "memory(GiB)": 21.49, "elapsed_time": "54m 4s", "remaining_time": "10h 42m 20s", "loss_scale": 1.0, "consumed_samples": 126464, "global_step/max_steps": "494/6362"} +{"lm loss": 5.41144848, "grad_norm": 1.14646327, "learning_rate": 9.98e-05, "elapsed_time_per_iteration": 6.57774138, "memory(GiB)": 21.49, "elapsed_time": "54m 11s", "remaining_time": "10h 42m 13s", "loss_scale": 1.0, "consumed_samples": 126720, "global_step/max_steps": "495/6362"} +{"lm loss": 5.40111828, "grad_norm": 1.65303767, "learning_rate": 9.979e-05, "elapsed_time_per_iteration": 6.52309322, "memory(GiB)": 21.49, "elapsed_time": "54m 17s", "remaining_time": "10h 42m 6s", "loss_scale": 1.0, "consumed_samples": 126976, "global_step/max_steps": "496/6362"} +{"lm loss": 5.40091562, "grad_norm": 1.15194404, "learning_rate": 9.979e-05, "elapsed_time_per_iteration": 6.53940988, "memory(GiB)": 21.49, "elapsed_time": "54m 24s", "remaining_time": "10h 41m 59s", "loss_scale": 1.0, "consumed_samples": 127232, "global_step/max_steps": "497/6362"} +{"lm loss": 5.41061258, "grad_norm": 1.7064836, "learning_rate": 9.979e-05, "elapsed_time_per_iteration": 6.50189304, "memory(GiB)": 21.49, "elapsed_time": "54m 30s", "remaining_time": "10h 41m 52s", "loss_scale": 1.0, "consumed_samples": 127488, "global_step/max_steps": "498/6362"} +{"lm loss": 5.41758108, "grad_norm": 1.20118117, "learning_rate": 9.979e-05, "elapsed_time_per_iteration": 6.57692099, "memory(GiB)": 21.49, "elapsed_time": "54m 37s", "remaining_time": "10h 41m 45s", "loss_scale": 1.0, "consumed_samples": 127744, "global_step/max_steps": "499/6362"} +{"lm loss": 5.40875006, "grad_norm": 1.63647985, "learning_rate": 9.978e-05, "elapsed_time_per_iteration": 6.36848927, "memory(GiB)": 21.49, "elapsed_time": "54m 43s", "remaining_time": "10h 41m 36s", "loss_scale": 1.0, "consumed_samples": 128000, "global_step/max_steps": "500/6362"} +{"lm loss": 5.4078455, "grad_norm": 1.25797033, "learning_rate": 9.978e-05, "elapsed_time_per_iteration": 6.53336024, "memory(GiB)": 21.49, "elapsed_time": "54m 50s", "remaining_time": "10h 41m 29s", "loss_scale": 1.0, "consumed_samples": 128256, "global_step/max_steps": "501/6362"} +{"lm loss": 5.41220236, "grad_norm": 1.49108148, "learning_rate": 9.978e-05, "elapsed_time_per_iteration": 6.64234471, "memory(GiB)": 21.49, "elapsed_time": "54m 56s", "remaining_time": "10h 41m 24s", "loss_scale": 1.0, "consumed_samples": 128512, "global_step/max_steps": "502/6362"} +{"lm loss": 5.41487598, "grad_norm": 1.61180449, "learning_rate": 9.978e-05, "elapsed_time_per_iteration": 6.56368303, "memory(GiB)": 21.49, "elapsed_time": "55m 3s", "remaining_time": "10h 41m 17s", "loss_scale": 1.0, "consumed_samples": 128768, "global_step/max_steps": "503/6362"} +{"lm loss": 5.41133595, "grad_norm": 1.3874892, "learning_rate": 9.977e-05, "elapsed_time_per_iteration": 6.54569697, "memory(GiB)": 21.49, "elapsed_time": "55m 9s", "remaining_time": "10h 41m 10s", "loss_scale": 1.0, "consumed_samples": 129024, "global_step/max_steps": "504/6362"} +{"lm loss": 5.40940905, "grad_norm": 1.551296, "learning_rate": 9.977e-05, "elapsed_time_per_iteration": 6.64114475, "memory(GiB)": 21.49, "elapsed_time": "55m 16s", "remaining_time": "10h 41m 5s", "loss_scale": 1.0, "consumed_samples": 129280, "global_step/max_steps": "505/6362"} +{"lm loss": 5.40841103, "grad_norm": 1.25394475, "learning_rate": 9.977e-05, "elapsed_time_per_iteration": 6.742836, "memory(GiB)": 21.49, "elapsed_time": "55m 23s", "remaining_time": "10h 41m 0s", "loss_scale": 1.0, "consumed_samples": 129536, "global_step/max_steps": "506/6362"} +{"lm loss": 5.41708469, "grad_norm": 1.43996835, "learning_rate": 9.977e-05, "elapsed_time_per_iteration": 6.60327506, "memory(GiB)": 21.49, "elapsed_time": "55m 29s", "remaining_time": "10h 40m 54s", "loss_scale": 1.0, "consumed_samples": 129792, "global_step/max_steps": "507/6362"} +{"lm loss": 5.41542149, "grad_norm": 1.48389912, "learning_rate": 9.976e-05, "elapsed_time_per_iteration": 6.6981678, "memory(GiB)": 21.49, "elapsed_time": "55m 36s", "remaining_time": "10h 40m 49s", "loss_scale": 1.0, "consumed_samples": 130048, "global_step/max_steps": "508/6362"} +{"lm loss": 5.39169693, "grad_norm": 1.14732754, "learning_rate": 9.976e-05, "elapsed_time_per_iteration": 6.4817059, "memory(GiB)": 21.49, "elapsed_time": "55m 43s", "remaining_time": "10h 40m 41s", "loss_scale": 1.0, "consumed_samples": 130304, "global_step/max_steps": "509/6362"} +{"lm loss": 5.43196344, "grad_norm": 1.63585877, "learning_rate": 9.976e-05, "elapsed_time_per_iteration": 6.37727976, "memory(GiB)": 21.49, "elapsed_time": "55m 49s", "remaining_time": "10h 40m 33s", "loss_scale": 1.0, "consumed_samples": 130560, "global_step/max_steps": "510/6362"} +{"lm loss": 5.40655708, "grad_norm": 1.18664801, "learning_rate": 9.976e-05, "elapsed_time_per_iteration": 6.60948658, "memory(GiB)": 21.49, "elapsed_time": "55m 56s", "remaining_time": "10h 40m 26s", "loss_scale": 1.0, "consumed_samples": 130816, "global_step/max_steps": "511/6362"} +{"lm loss": 5.40800285, "grad_norm": 1.5535748, "learning_rate": 9.975e-05, "elapsed_time_per_iteration": 6.49623895, "memory(GiB)": 21.49, "elapsed_time": "56m 2s", "remaining_time": "10h 40m 19s", "loss_scale": 1.0, "consumed_samples": 131072, "global_step/max_steps": "512/6362"} +{"lm loss": 5.40599871, "grad_norm": 1.33940458, "learning_rate": 9.975e-05, "elapsed_time_per_iteration": 6.68157959, "memory(GiB)": 21.49, "elapsed_time": "56m 9s", "remaining_time": "10h 40m 14s", "loss_scale": 1.0, "consumed_samples": 131328, "global_step/max_steps": "513/6362"} +{"lm loss": 5.39783239, "grad_norm": 1.50566685, "learning_rate": 9.975e-05, "elapsed_time_per_iteration": 6.63689995, "memory(GiB)": 21.49, "elapsed_time": "56m 15s", "remaining_time": "10h 40m 8s", "loss_scale": 1.0, "consumed_samples": 131584, "global_step/max_steps": "514/6362"} +{"lm loss": 5.41675711, "grad_norm": 1.09710872, "learning_rate": 9.975e-05, "elapsed_time_per_iteration": 6.50028443, "memory(GiB)": 21.49, "elapsed_time": "56m 22s", "remaining_time": "10h 40m 1s", "loss_scale": 1.0, "consumed_samples": 131840, "global_step/max_steps": "515/6362"} +{"lm loss": 5.41369677, "grad_norm": 1.62614262, "learning_rate": 9.974e-05, "elapsed_time_per_iteration": 6.6403172, "memory(GiB)": 21.49, "elapsed_time": "56m 28s", "remaining_time": "10h 39m 55s", "loss_scale": 1.0, "consumed_samples": 132096, "global_step/max_steps": "516/6362"} +{"lm loss": 5.41256332, "grad_norm": 2.04277182, "learning_rate": 9.974e-05, "elapsed_time_per_iteration": 6.73354435, "memory(GiB)": 21.49, "elapsed_time": "56m 35s", "remaining_time": "10h 39m 50s", "loss_scale": 1.0, "consumed_samples": 132352, "global_step/max_steps": "517/6362"} +{"lm loss": 5.39925957, "grad_norm": 1.2256279, "learning_rate": 9.974e-05, "elapsed_time_per_iteration": 6.47480488, "memory(GiB)": 21.49, "elapsed_time": "56m 42s", "remaining_time": "10h 39m 43s", "loss_scale": 1.0, "consumed_samples": 132608, "global_step/max_steps": "518/6362"} +{"lm loss": 5.402637, "grad_norm": 1.822752, "learning_rate": 9.974e-05, "elapsed_time_per_iteration": 6.41403747, "memory(GiB)": 21.49, "elapsed_time": "56m 48s", "remaining_time": "10h 39m 34s", "loss_scale": 1.0, "consumed_samples": 132864, "global_step/max_steps": "519/6362"} +{"lm loss": 5.41406631, "grad_norm": 1.23326385, "learning_rate": 9.973e-05, "elapsed_time_per_iteration": 6.48150635, "memory(GiB)": 21.49, "elapsed_time": "56m 55s", "remaining_time": "10h 39m 27s", "loss_scale": 1.0, "consumed_samples": 133120, "global_step/max_steps": "520/6362"} +{"lm loss": 5.40427446, "grad_norm": 1.84724176, "learning_rate": 9.973e-05, "elapsed_time_per_iteration": 6.48664498, "memory(GiB)": 21.49, "elapsed_time": "57m 1s", "remaining_time": "10h 39m 19s", "loss_scale": 1.0, "consumed_samples": 133376, "global_step/max_steps": "521/6362"} +{"lm loss": 5.39119625, "grad_norm": 1.29797757, "learning_rate": 9.973e-05, "elapsed_time_per_iteration": 6.7338295, "memory(GiB)": 21.49, "elapsed_time": "57m 8s", "remaining_time": "10h 39m 15s", "loss_scale": 1.0, "consumed_samples": 133632, "global_step/max_steps": "522/6362"} +{"lm loss": 5.42173958, "grad_norm": 1.57966626, "learning_rate": 9.973e-05, "elapsed_time_per_iteration": 6.59780765, "memory(GiB)": 21.49, "elapsed_time": "57m 14s", "remaining_time": "10h 39m 8s", "loss_scale": 1.0, "consumed_samples": 133888, "global_step/max_steps": "523/6362"} +{"lm loss": 5.39264774, "grad_norm": 1.45429039, "learning_rate": 9.972e-05, "elapsed_time_per_iteration": 6.59718871, "memory(GiB)": 21.49, "elapsed_time": "57m 21s", "remaining_time": "10h 39m 2s", "loss_scale": 1.0, "consumed_samples": 134144, "global_step/max_steps": "524/6362"} +{"lm loss": 5.39558554, "grad_norm": 1.45300913, "learning_rate": 9.972e-05, "elapsed_time_per_iteration": 6.42765522, "memory(GiB)": 21.49, "elapsed_time": "57m 27s", "remaining_time": "10h 38m 54s", "loss_scale": 1.0, "consumed_samples": 134400, "global_step/max_steps": "525/6362"} +{"lm loss": 5.39219093, "grad_norm": 1.17562246, "learning_rate": 9.972e-05, "elapsed_time_per_iteration": 6.45653629, "memory(GiB)": 21.49, "elapsed_time": "57m 34s", "remaining_time": "10h 38m 46s", "loss_scale": 1.0, "consumed_samples": 134656, "global_step/max_steps": "526/6362"} +{"lm loss": 5.40979958, "grad_norm": 1.64934862, "learning_rate": 9.971e-05, "elapsed_time_per_iteration": 6.5174818, "memory(GiB)": 21.49, "elapsed_time": "57m 40s", "remaining_time": "10h 38m 39s", "loss_scale": 1.0, "consumed_samples": 134912, "global_step/max_steps": "527/6362"} +{"lm loss": 5.40197515, "grad_norm": 1.23710775, "learning_rate": 9.971e-05, "elapsed_time_per_iteration": 6.53247714, "memory(GiB)": 21.49, "elapsed_time": "57m 47s", "remaining_time": "10h 38m 32s", "loss_scale": 1.0, "consumed_samples": 135168, "global_step/max_steps": "528/6362"} +{"lm loss": 5.37528419, "grad_norm": 1.65114295, "learning_rate": 9.971e-05, "elapsed_time_per_iteration": 6.63625145, "memory(GiB)": 21.49, "elapsed_time": "57m 54s", "remaining_time": "10h 38m 26s", "loss_scale": 1.0, "consumed_samples": 135424, "global_step/max_steps": "529/6362"} +{"lm loss": 5.3933444, "grad_norm": 1.27551794, "learning_rate": 9.971e-05, "elapsed_time_per_iteration": 6.52337742, "memory(GiB)": 21.49, "elapsed_time": "58m 0s", "remaining_time": "10h 38m 19s", "loss_scale": 1.0, "consumed_samples": 135680, "global_step/max_steps": "530/6362"} +{"lm loss": 5.39099073, "grad_norm": 1.59040689, "learning_rate": 9.97e-05, "elapsed_time_per_iteration": 6.55740476, "memory(GiB)": 21.49, "elapsed_time": "58m 7s", "remaining_time": "10h 38m 13s", "loss_scale": 1.0, "consumed_samples": 135936, "global_step/max_steps": "531/6362"} +{"lm loss": 5.3994174, "grad_norm": 1.49250412, "learning_rate": 9.97e-05, "elapsed_time_per_iteration": 6.53144789, "memory(GiB)": 21.49, "elapsed_time": "58m 13s", "remaining_time": "10h 38m 6s", "loss_scale": 1.0, "consumed_samples": 136192, "global_step/max_steps": "532/6362"} +{"lm loss": 5.41091108, "grad_norm": 1.2714268, "learning_rate": 9.97e-05, "elapsed_time_per_iteration": 6.53178382, "memory(GiB)": 21.49, "elapsed_time": "58m 20s", "remaining_time": "10h 37m 59s", "loss_scale": 1.0, "consumed_samples": 136448, "global_step/max_steps": "533/6362"} +{"lm loss": 5.38376665, "grad_norm": 1.2333169, "learning_rate": 9.969e-05, "elapsed_time_per_iteration": 6.50604939, "memory(GiB)": 21.49, "elapsed_time": "58m 26s", "remaining_time": "10h 37m 52s", "loss_scale": 1.0, "consumed_samples": 136704, "global_step/max_steps": "534/6362"} +{"lm loss": 5.41302919, "grad_norm": 1.47944295, "learning_rate": 9.969e-05, "elapsed_time_per_iteration": 6.58252692, "memory(GiB)": 21.49, "elapsed_time": "58m 33s", "remaining_time": "10h 37m 45s", "loss_scale": 1.0, "consumed_samples": 136960, "global_step/max_steps": "535/6362"} +{"lm loss": 5.39853621, "grad_norm": 1.33718979, "learning_rate": 9.969e-05, "elapsed_time_per_iteration": 6.48590565, "memory(GiB)": 21.49, "elapsed_time": "58m 39s", "remaining_time": "10h 37m 38s", "loss_scale": 1.0, "consumed_samples": 137216, "global_step/max_steps": "536/6362"} +{"lm loss": 5.39875174, "grad_norm": 1.63654053, "learning_rate": 9.969e-05, "elapsed_time_per_iteration": 6.49761057, "memory(GiB)": 21.49, "elapsed_time": "58m 46s", "remaining_time": "10h 37m 30s", "loss_scale": 1.0, "consumed_samples": 137472, "global_step/max_steps": "537/6362"} +{"lm loss": 5.3788619, "grad_norm": 1.35036266, "learning_rate": 9.968e-05, "elapsed_time_per_iteration": 6.55696797, "memory(GiB)": 21.49, "elapsed_time": "58m 52s", "remaining_time": "10h 37m 24s", "loss_scale": 1.0, "consumed_samples": 137728, "global_step/max_steps": "538/6362"} +{"lm loss": 5.39310551, "grad_norm": 1.58270311, "learning_rate": 9.968e-05, "elapsed_time_per_iteration": 6.69187593, "memory(GiB)": 21.49, "elapsed_time": "58m 59s", "remaining_time": "10h 37m 18s", "loss_scale": 1.0, "consumed_samples": 137984, "global_step/max_steps": "539/6362"} +{"lm loss": 5.39763927, "grad_norm": 1.08676863, "learning_rate": 9.968e-05, "elapsed_time_per_iteration": 6.55666614, "memory(GiB)": 21.49, "elapsed_time": "59m 6s", "remaining_time": "10h 37m 12s", "loss_scale": 1.0, "consumed_samples": 138240, "global_step/max_steps": "540/6362"} +{"lm loss": 5.40778065, "grad_norm": 1.34994733, "learning_rate": 9.967e-05, "elapsed_time_per_iteration": 6.41348791, "memory(GiB)": 21.49, "elapsed_time": "59m 12s", "remaining_time": "10h 37m 4s", "loss_scale": 1.0, "consumed_samples": 138496, "global_step/max_steps": "541/6362"} +{"lm loss": 5.3644805, "grad_norm": 1.33957517, "learning_rate": 9.967e-05, "elapsed_time_per_iteration": 6.61277485, "memory(GiB)": 21.49, "elapsed_time": "59m 19s", "remaining_time": "10h 36m 57s", "loss_scale": 1.0, "consumed_samples": 138752, "global_step/max_steps": "542/6362"} +{"lm loss": 5.40079021, "grad_norm": 1.23066866, "learning_rate": 9.967e-05, "elapsed_time_per_iteration": 6.59481859, "memory(GiB)": 21.49, "elapsed_time": "59m 25s", "remaining_time": "10h 36m 51s", "loss_scale": 1.0, "consumed_samples": 139008, "global_step/max_steps": "543/6362"} +{"lm loss": 5.39364481, "grad_norm": 1.44660902, "learning_rate": 9.967e-05, "elapsed_time_per_iteration": 6.41994095, "memory(GiB)": 21.49, "elapsed_time": "59m 32s", "remaining_time": "10h 36m 43s", "loss_scale": 1.0, "consumed_samples": 139264, "global_step/max_steps": "544/6362"} +{"lm loss": 5.39911461, "grad_norm": 1.43399823, "learning_rate": 9.966e-05, "elapsed_time_per_iteration": 6.52195549, "memory(GiB)": 21.49, "elapsed_time": "59m 38s", "remaining_time": "10h 36m 36s", "loss_scale": 1.0, "consumed_samples": 139520, "global_step/max_steps": "545/6362"} +{"lm loss": 5.38323736, "grad_norm": 1.4528178, "learning_rate": 9.966e-05, "elapsed_time_per_iteration": 6.55749202, "memory(GiB)": 21.49, "elapsed_time": "59m 45s", "remaining_time": "10h 36m 29s", "loss_scale": 1.0, "consumed_samples": 139776, "global_step/max_steps": "546/6362"} +{"lm loss": 5.37861109, "grad_norm": 1.54251766, "learning_rate": 9.966e-05, "elapsed_time_per_iteration": 6.5296967, "memory(GiB)": 21.49, "elapsed_time": "59m 51s", "remaining_time": "10h 36m 22s", "loss_scale": 1.0, "consumed_samples": 140032, "global_step/max_steps": "547/6362"} +{"lm loss": 5.38244581, "grad_norm": 1.1884352, "learning_rate": 9.965e-05, "elapsed_time_per_iteration": 6.57493687, "memory(GiB)": 21.49, "elapsed_time": "59m 58s", "remaining_time": "10h 36m 16s", "loss_scale": 1.0, "consumed_samples": 140288, "global_step/max_steps": "548/6362"} +{"lm loss": 5.37054014, "grad_norm": 1.40941906, "learning_rate": 9.965e-05, "elapsed_time_per_iteration": 6.50748205, "memory(GiB)": 21.49, "elapsed_time": "1h 0m 4s", "remaining_time": "10h 36m 9s", "loss_scale": 1.0, "consumed_samples": 140544, "global_step/max_steps": "549/6362"} +{"lm loss": 5.3842206, "grad_norm": 1.30380774, "learning_rate": 9.965e-05, "elapsed_time_per_iteration": 6.49776793, "memory(GiB)": 21.49, "elapsed_time": "1h 0m 11s", "remaining_time": "10h 36m 1s", "loss_scale": 1.0, "consumed_samples": 140800, "global_step/max_steps": "550/6362"} +{"lm loss": 5.36842299, "grad_norm": 1.59080088, "learning_rate": 9.965e-05, "elapsed_time_per_iteration": 6.46581769, "memory(GiB)": 21.49, "elapsed_time": "1h 0m 17s", "remaining_time": "10h 35m 54s", "loss_scale": 1.0, "consumed_samples": 141056, "global_step/max_steps": "551/6362"} +{"lm loss": 5.37076759, "grad_norm": 1.29508185, "learning_rate": 9.964e-05, "elapsed_time_per_iteration": 6.58665514, "memory(GiB)": 21.49, "elapsed_time": "1h 0m 24s", "remaining_time": "10h 35m 48s", "loss_scale": 1.0, "consumed_samples": 141312, "global_step/max_steps": "552/6362"} +{"lm loss": 5.363585, "grad_norm": 1.44861865, "learning_rate": 9.964e-05, "elapsed_time_per_iteration": 6.84543371, "memory(GiB)": 21.49, "elapsed_time": "1h 0m 31s", "remaining_time": "10h 35m 44s", "loss_scale": 1.0, "consumed_samples": 141568, "global_step/max_steps": "553/6362"} +{"lm loss": 5.37660313, "grad_norm": 1.33077037, "learning_rate": 9.964e-05, "elapsed_time_per_iteration": 6.73388004, "memory(GiB)": 21.49, "elapsed_time": "1h 0m 37s", "remaining_time": "10h 35m 39s", "loss_scale": 1.0, "consumed_samples": 141824, "global_step/max_steps": "554/6362"} +{"lm loss": 5.38343382, "grad_norm": 1.47205472, "learning_rate": 9.963e-05, "elapsed_time_per_iteration": 6.65711761, "memory(GiB)": 21.49, "elapsed_time": "1h 0m 44s", "remaining_time": "10h 35m 33s", "loss_scale": 1.0, "consumed_samples": 142080, "global_step/max_steps": "555/6362"} +{"lm loss": 5.39062452, "grad_norm": 1.4255445, "learning_rate": 9.963e-05, "elapsed_time_per_iteration": 6.52675128, "memory(GiB)": 21.49, "elapsed_time": "1h 0m 51s", "remaining_time": "10h 35m 26s", "loss_scale": 1.0, "consumed_samples": 142336, "global_step/max_steps": "556/6362"} +{"lm loss": 5.36103821, "grad_norm": 1.43560064, "learning_rate": 9.963e-05, "elapsed_time_per_iteration": 6.81292272, "memory(GiB)": 21.49, "elapsed_time": "1h 0m 57s", "remaining_time": "10h 35m 22s", "loss_scale": 1.0, "consumed_samples": 142592, "global_step/max_steps": "557/6362"} +{"lm loss": 5.37036276, "grad_norm": 1.54565811, "learning_rate": 9.962e-05, "elapsed_time_per_iteration": 6.7106545, "memory(GiB)": 21.49, "elapsed_time": "1h 1m 4s", "remaining_time": "10h 35m 17s", "loss_scale": 1.0, "consumed_samples": 142848, "global_step/max_steps": "558/6362"} +{"lm loss": 5.37528896, "grad_norm": 1.39744449, "learning_rate": 9.962e-05, "elapsed_time_per_iteration": 6.61251807, "memory(GiB)": 21.49, "elapsed_time": "1h 1m 11s", "remaining_time": "10h 35m 11s", "loss_scale": 1.0, "consumed_samples": 143104, "global_step/max_steps": "559/6362"} +{"lm loss": 5.35856295, "grad_norm": 1.24462759, "learning_rate": 9.962e-05, "elapsed_time_per_iteration": 6.59155083, "memory(GiB)": 21.49, "elapsed_time": "1h 1m 17s", "remaining_time": "10h 35m 5s", "loss_scale": 1.0, "consumed_samples": 143360, "global_step/max_steps": "560/6362"} +{"lm loss": 5.37723541, "grad_norm": 1.42906272, "learning_rate": 9.961e-05, "elapsed_time_per_iteration": 6.43345785, "memory(GiB)": 21.49, "elapsed_time": "1h 1m 24s", "remaining_time": "10h 34m 57s", "loss_scale": 1.0, "consumed_samples": 143616, "global_step/max_steps": "561/6362"} +{"lm loss": 5.37005997, "grad_norm": 1.45674646, "learning_rate": 9.961e-05, "elapsed_time_per_iteration": 6.49474478, "memory(GiB)": 21.49, "elapsed_time": "1h 1m 30s", "remaining_time": "10h 34m 50s", "loss_scale": 1.0, "consumed_samples": 143872, "global_step/max_steps": "562/6362"} +{"lm loss": 5.37368488, "grad_norm": 1.15942204, "learning_rate": 9.961e-05, "elapsed_time_per_iteration": 6.32667208, "memory(GiB)": 21.49, "elapsed_time": "1h 1m 37s", "remaining_time": "10h 34m 41s", "loss_scale": 1.0, "consumed_samples": 144128, "global_step/max_steps": "563/6362"} +{"lm loss": 5.36203527, "grad_norm": 1.45679498, "learning_rate": 9.96e-05, "elapsed_time_per_iteration": 6.73183846, "memory(GiB)": 21.49, "elapsed_time": "1h 1m 43s", "remaining_time": "10h 34m 36s", "loss_scale": 1.0, "consumed_samples": 144384, "global_step/max_steps": "564/6362"} +{"lm loss": 5.36327982, "grad_norm": 1.2603327, "learning_rate": 9.96e-05, "elapsed_time_per_iteration": 6.55071139, "memory(GiB)": 21.49, "elapsed_time": "1h 1m 50s", "remaining_time": "10h 34m 29s", "loss_scale": 1.0, "consumed_samples": 144640, "global_step/max_steps": "565/6362"} +{"lm loss": 5.37196255, "grad_norm": 1.43250179, "learning_rate": 9.96e-05, "elapsed_time_per_iteration": 6.36121607, "memory(GiB)": 21.49, "elapsed_time": "1h 1m 56s", "remaining_time": "10h 34m 20s", "loss_scale": 1.0, "consumed_samples": 144896, "global_step/max_steps": "566/6362"} +{"lm loss": 5.38247347, "grad_norm": 1.26107323, "learning_rate": 9.959e-05, "elapsed_time_per_iteration": 6.4941237, "memory(GiB)": 21.49, "elapsed_time": "1h 2m 3s", "remaining_time": "10h 34m 13s", "loss_scale": 1.0, "consumed_samples": 145152, "global_step/max_steps": "567/6362"} +{"lm loss": 5.37403011, "grad_norm": 1.40909433, "learning_rate": 9.959e-05, "elapsed_time_per_iteration": 6.58855677, "memory(GiB)": 21.49, "elapsed_time": "1h 2m 9s", "remaining_time": "10h 34m 7s", "loss_scale": 1.0, "consumed_samples": 145408, "global_step/max_steps": "568/6362"} +{"lm loss": 5.37033033, "grad_norm": 1.28974581, "learning_rate": 9.959e-05, "elapsed_time_per_iteration": 6.56969404, "memory(GiB)": 21.49, "elapsed_time": "1h 2m 16s", "remaining_time": "10h 34m 0s", "loss_scale": 1.0, "consumed_samples": 145664, "global_step/max_steps": "569/6362"} +{"lm loss": 5.36307764, "grad_norm": 1.40646911, "learning_rate": 9.958e-05, "elapsed_time_per_iteration": 6.54944372, "memory(GiB)": 21.49, "elapsed_time": "1h 2m 22s", "remaining_time": "10h 33m 53s", "loss_scale": 1.0, "consumed_samples": 145920, "global_step/max_steps": "570/6362"} +{"lm loss": 5.3716898, "grad_norm": 1.19145548, "learning_rate": 9.958e-05, "elapsed_time_per_iteration": 6.55145693, "memory(GiB)": 21.49, "elapsed_time": "1h 2m 29s", "remaining_time": "10h 33m 47s", "loss_scale": 1.0, "consumed_samples": 146176, "global_step/max_steps": "571/6362"} +{"lm loss": 5.36098385, "grad_norm": 1.57776952, "learning_rate": 9.958e-05, "elapsed_time_per_iteration": 6.778929, "memory(GiB)": 21.49, "elapsed_time": "1h 2m 36s", "remaining_time": "10h 33m 42s", "loss_scale": 1.0, "consumed_samples": 146432, "global_step/max_steps": "572/6362"} +{"lm loss": 5.38669872, "grad_norm": 1.00783539, "learning_rate": 9.957e-05, "elapsed_time_per_iteration": 6.43907166, "memory(GiB)": 21.49, "elapsed_time": "1h 2m 42s", "remaining_time": "10h 33m 34s", "loss_scale": 1.0, "consumed_samples": 146688, "global_step/max_steps": "573/6362"} +{"lm loss": 5.37715912, "grad_norm": 1.63899779, "learning_rate": 9.957e-05, "elapsed_time_per_iteration": 6.52846932, "memory(GiB)": 21.49, "elapsed_time": "1h 2m 49s", "remaining_time": "10h 33m 27s", "loss_scale": 1.0, "consumed_samples": 146944, "global_step/max_steps": "574/6362"} +{"lm loss": 5.36509132, "grad_norm": 1.17709982, "learning_rate": 9.957e-05, "elapsed_time_per_iteration": 6.6212585, "memory(GiB)": 21.49, "elapsed_time": "1h 2m 55s", "remaining_time": "10h 33m 21s", "loss_scale": 1.0, "consumed_samples": 147200, "global_step/max_steps": "575/6362"} +{"lm loss": 5.3930192, "grad_norm": 1.59049451, "learning_rate": 9.956e-05, "elapsed_time_per_iteration": 6.90335608, "memory(GiB)": 21.49, "elapsed_time": "1h 3m 2s", "remaining_time": "10h 33m 18s", "loss_scale": 1.0, "consumed_samples": 147456, "global_step/max_steps": "576/6362"} +{"lm loss": 5.37312412, "grad_norm": 1.50383341, "learning_rate": 9.956e-05, "elapsed_time_per_iteration": 6.60485196, "memory(GiB)": 21.49, "elapsed_time": "1h 3m 9s", "remaining_time": "10h 33m 12s", "loss_scale": 1.0, "consumed_samples": 147712, "global_step/max_steps": "577/6362"} +{"lm loss": 5.38316154, "grad_norm": 1.42842758, "learning_rate": 9.956e-05, "elapsed_time_per_iteration": 6.48326612, "memory(GiB)": 21.49, "elapsed_time": "1h 3m 15s", "remaining_time": "10h 33m 5s", "loss_scale": 1.0, "consumed_samples": 147968, "global_step/max_steps": "578/6362"} +{"lm loss": 5.37354231, "grad_norm": 1.25776815, "learning_rate": 9.955e-05, "elapsed_time_per_iteration": 6.44114137, "memory(GiB)": 21.49, "elapsed_time": "1h 3m 22s", "remaining_time": "10h 32m 57s", "loss_scale": 1.0, "consumed_samples": 148224, "global_step/max_steps": "579/6362"} +{"lm loss": 5.36120176, "grad_norm": 1.65327263, "learning_rate": 9.955e-05, "elapsed_time_per_iteration": 6.63104749, "memory(GiB)": 21.49, "elapsed_time": "1h 3m 28s", "remaining_time": "10h 32m 51s", "loss_scale": 1.0, "consumed_samples": 148480, "global_step/max_steps": "580/6362"} +{"lm loss": 5.35718203, "grad_norm": 1.31692326, "learning_rate": 9.955e-05, "elapsed_time_per_iteration": 6.52727222, "memory(GiB)": 21.49, "elapsed_time": "1h 3m 35s", "remaining_time": "10h 32m 44s", "loss_scale": 1.0, "consumed_samples": 148736, "global_step/max_steps": "581/6362"} +{"lm loss": 5.36739254, "grad_norm": 1.25794947, "learning_rate": 9.954e-05, "elapsed_time_per_iteration": 6.39560318, "memory(GiB)": 21.49, "elapsed_time": "1h 3m 41s", "remaining_time": "10h 32m 36s", "loss_scale": 1.0, "consumed_samples": 148992, "global_step/max_steps": "582/6362"} +{"lm loss": 5.36027384, "grad_norm": 1.4107728, "learning_rate": 9.954e-05, "elapsed_time_per_iteration": 6.53772616, "memory(GiB)": 21.49, "elapsed_time": "1h 3m 48s", "remaining_time": "10h 32m 29s", "loss_scale": 1.0, "consumed_samples": 149248, "global_step/max_steps": "583/6362"} +{"lm loss": 5.34272957, "grad_norm": 1.19108939, "learning_rate": 9.954e-05, "elapsed_time_per_iteration": 6.77840996, "memory(GiB)": 21.49, "elapsed_time": "1h 3m 55s", "remaining_time": "10h 32m 24s", "loss_scale": 1.0, "consumed_samples": 149504, "global_step/max_steps": "584/6362"} +{"lm loss": 5.37296438, "grad_norm": 1.47162294, "learning_rate": 9.953e-05, "elapsed_time_per_iteration": 6.53650737, "memory(GiB)": 21.49, "elapsed_time": "1h 4m 1s", "remaining_time": "10h 32m 17s", "loss_scale": 1.0, "consumed_samples": 149760, "global_step/max_steps": "585/6362"} +{"lm loss": 5.35599136, "grad_norm": 1.35609102, "learning_rate": 9.953e-05, "elapsed_time_per_iteration": 6.29992938, "memory(GiB)": 21.49, "elapsed_time": "1h 4m 8s", "remaining_time": "10h 32m 8s", "loss_scale": 1.0, "consumed_samples": 150016, "global_step/max_steps": "586/6362"} +{"lm loss": 5.35286474, "grad_norm": 1.38086629, "learning_rate": 9.953e-05, "elapsed_time_per_iteration": 6.43665504, "memory(GiB)": 21.49, "elapsed_time": "1h 4m 14s", "remaining_time": "10h 32m 0s", "loss_scale": 1.0, "consumed_samples": 150272, "global_step/max_steps": "587/6362"} +{"lm loss": 5.37307024, "grad_norm": 1.41835868, "learning_rate": 9.952e-05, "elapsed_time_per_iteration": 6.53768945, "memory(GiB)": 21.49, "elapsed_time": "1h 4m 21s", "remaining_time": "10h 31m 54s", "loss_scale": 1.0, "consumed_samples": 150528, "global_step/max_steps": "588/6362"} +{"lm loss": 5.36996412, "grad_norm": 1.24417496, "learning_rate": 9.952e-05, "elapsed_time_per_iteration": 6.35462284, "memory(GiB)": 21.49, "elapsed_time": "1h 4m 27s", "remaining_time": "10h 31m 45s", "loss_scale": 1.0, "consumed_samples": 150784, "global_step/max_steps": "589/6362"} +{"lm loss": 5.36186981, "grad_norm": 1.22359025, "learning_rate": 9.952e-05, "elapsed_time_per_iteration": 6.50194764, "memory(GiB)": 21.49, "elapsed_time": "1h 4m 33s", "remaining_time": "10h 31m 38s", "loss_scale": 1.0, "consumed_samples": 151040, "global_step/max_steps": "590/6362"} +{"lm loss": 5.35542679, "grad_norm": 1.18883955, "learning_rate": 9.951e-05, "elapsed_time_per_iteration": 6.52883816, "memory(GiB)": 21.49, "elapsed_time": "1h 4m 40s", "remaining_time": "10h 31m 31s", "loss_scale": 1.0, "consumed_samples": 151296, "global_step/max_steps": "591/6362"} +{"lm loss": 5.35849142, "grad_norm": 1.54930401, "learning_rate": 9.951e-05, "elapsed_time_per_iteration": 6.30872488, "memory(GiB)": 21.49, "elapsed_time": "1h 4m 46s", "remaining_time": "10h 31m 22s", "loss_scale": 1.0, "consumed_samples": 151552, "global_step/max_steps": "592/6362"} +{"lm loss": 5.35510159, "grad_norm": 1.37761331, "learning_rate": 9.951e-05, "elapsed_time_per_iteration": 6.34693122, "memory(GiB)": 21.49, "elapsed_time": "1h 4m 53s", "remaining_time": "10h 31m 13s", "loss_scale": 1.0, "consumed_samples": 151808, "global_step/max_steps": "593/6362"} +{"lm loss": 5.33613825, "grad_norm": 1.14953172, "learning_rate": 9.95e-05, "elapsed_time_per_iteration": 6.34080148, "memory(GiB)": 21.49, "elapsed_time": "1h 4m 59s", "remaining_time": "10h 31m 4s", "loss_scale": 1.0, "consumed_samples": 152064, "global_step/max_steps": "594/6362"} +{"lm loss": 5.36201096, "grad_norm": 1.42968214, "learning_rate": 9.95e-05, "elapsed_time_per_iteration": 6.43588138, "memory(GiB)": 21.49, "elapsed_time": "1h 5m 5s", "remaining_time": "10h 30m 57s", "loss_scale": 1.0, "consumed_samples": 152320, "global_step/max_steps": "595/6362"} +{"lm loss": 5.33869886, "grad_norm": 1.30452406, "learning_rate": 9.949e-05, "elapsed_time_per_iteration": 6.45098472, "memory(GiB)": 21.49, "elapsed_time": "1h 5m 12s", "remaining_time": "10h 30m 49s", "loss_scale": 1.0, "consumed_samples": 152576, "global_step/max_steps": "596/6362"} +{"lm loss": 5.35138178, "grad_norm": 1.25069785, "learning_rate": 9.949e-05, "elapsed_time_per_iteration": 6.33259583, "memory(GiB)": 21.49, "elapsed_time": "1h 5m 18s", "remaining_time": "10h 30m 40s", "loss_scale": 1.0, "consumed_samples": 152832, "global_step/max_steps": "597/6362"} +{"lm loss": 5.33835936, "grad_norm": 1.27966237, "learning_rate": 9.949e-05, "elapsed_time_per_iteration": 6.481848, "memory(GiB)": 21.49, "elapsed_time": "1h 5m 25s", "remaining_time": "10h 30m 33s", "loss_scale": 1.0, "consumed_samples": 153088, "global_step/max_steps": "598/6362"} +{"lm loss": 5.35061979, "grad_norm": 1.28799987, "learning_rate": 9.948e-05, "elapsed_time_per_iteration": 6.4806025, "memory(GiB)": 21.49, "elapsed_time": "1h 5m 31s", "remaining_time": "10h 30m 25s", "loss_scale": 1.0, "consumed_samples": 153344, "global_step/max_steps": "599/6362"} +{"lm loss": 5.34873676, "grad_norm": 1.2040416, "learning_rate": 9.948e-05, "elapsed_time_per_iteration": 6.48811197, "memory(GiB)": 21.49, "elapsed_time": "1h 5m 38s", "remaining_time": "10h 30m 18s", "loss_scale": 1.0, "consumed_samples": 153600, "global_step/max_steps": "600/6362"} +{"lm loss": 5.35598993, "grad_norm": 1.46999514, "learning_rate": 9.948e-05, "elapsed_time_per_iteration": 6.56194854, "memory(GiB)": 21.49, "elapsed_time": "1h 5m 44s", "remaining_time": "10h 30m 11s", "loss_scale": 1.0, "consumed_samples": 153856, "global_step/max_steps": "601/6362"} +{"lm loss": 5.37035751, "grad_norm": 1.10829401, "learning_rate": 9.947e-05, "elapsed_time_per_iteration": 6.60124993, "memory(GiB)": 21.49, "elapsed_time": "1h 5m 51s", "remaining_time": "10h 30m 5s", "loss_scale": 1.0, "consumed_samples": 154112, "global_step/max_steps": "602/6362"} +{"lm loss": 5.36077356, "grad_norm": 1.51261318, "learning_rate": 9.947e-05, "elapsed_time_per_iteration": 6.68384647, "memory(GiB)": 21.49, "elapsed_time": "1h 5m 57s", "remaining_time": "10h 30m 0s", "loss_scale": 1.0, "consumed_samples": 154368, "global_step/max_steps": "603/6362"} +{"lm loss": 5.33644009, "grad_norm": 1.17070627, "learning_rate": 9.947e-05, "elapsed_time_per_iteration": 6.45220304, "memory(GiB)": 21.49, "elapsed_time": "1h 6m 4s", "remaining_time": "10h 29m 52s", "loss_scale": 1.0, "consumed_samples": 154624, "global_step/max_steps": "604/6362"} +{"lm loss": 5.35476542, "grad_norm": 1.21843719, "learning_rate": 9.946e-05, "elapsed_time_per_iteration": 6.56035924, "memory(GiB)": 21.49, "elapsed_time": "1h 6m 10s", "remaining_time": "10h 29m 46s", "loss_scale": 1.0, "consumed_samples": 154880, "global_step/max_steps": "605/6362"} +{"lm loss": 5.34195852, "grad_norm": 1.42342198, "learning_rate": 9.946e-05, "elapsed_time_per_iteration": 6.52275181, "memory(GiB)": 21.49, "elapsed_time": "1h 6m 17s", "remaining_time": "10h 29m 39s", "loss_scale": 1.0, "consumed_samples": 155136, "global_step/max_steps": "606/6362"} +{"lm loss": 5.34919214, "grad_norm": 1.1348362, "learning_rate": 9.945e-05, "elapsed_time_per_iteration": 6.38765836, "memory(GiB)": 21.49, "elapsed_time": "1h 6m 23s", "remaining_time": "10h 29m 30s", "loss_scale": 1.0, "consumed_samples": 155392, "global_step/max_steps": "607/6362"} +{"lm loss": 5.36398411, "grad_norm": 1.48909998, "learning_rate": 9.945e-05, "elapsed_time_per_iteration": 6.60616517, "memory(GiB)": 21.49, "elapsed_time": "1h 6m 30s", "remaining_time": "10h 29m 24s", "loss_scale": 1.0, "consumed_samples": 155648, "global_step/max_steps": "608/6362"} +{"lm loss": 5.35380459, "grad_norm": 1.0233072, "learning_rate": 9.945e-05, "elapsed_time_per_iteration": 6.61723709, "memory(GiB)": 21.49, "elapsed_time": "1h 6m 37s", "remaining_time": "10h 29m 18s", "loss_scale": 1.0, "consumed_samples": 155904, "global_step/max_steps": "609/6362"} +{"lm loss": 5.34786367, "grad_norm": 1.48981297, "learning_rate": 9.944e-05, "elapsed_time_per_iteration": 6.93663096, "memory(GiB)": 21.49, "elapsed_time": "1h 6m 43s", "remaining_time": "10h 29m 15s", "loss_scale": 1.0, "consumed_samples": 156160, "global_step/max_steps": "610/6362"} +{"lm loss": 5.3395052, "grad_norm": 1.19357407, "learning_rate": 9.944e-05, "elapsed_time_per_iteration": 6.53686833, "memory(GiB)": 21.49, "elapsed_time": "1h 6m 50s", "remaining_time": "10h 29m 8s", "loss_scale": 1.0, "consumed_samples": 156416, "global_step/max_steps": "611/6362"} +{"lm loss": 5.35251617, "grad_norm": 1.50807619, "learning_rate": 9.944e-05, "elapsed_time_per_iteration": 6.31276202, "memory(GiB)": 21.49, "elapsed_time": "1h 6m 56s", "remaining_time": "10h 28m 59s", "loss_scale": 1.0, "consumed_samples": 156672, "global_step/max_steps": "612/6362"} +{"lm loss": 5.34974289, "grad_norm": 1.26602387, "learning_rate": 9.943e-05, "elapsed_time_per_iteration": 6.6891315, "memory(GiB)": 21.49, "elapsed_time": "1h 7m 3s", "remaining_time": "10h 28m 54s", "loss_scale": 1.0, "consumed_samples": 156928, "global_step/max_steps": "613/6362"} +{"lm loss": 5.31030321, "grad_norm": 1.30783403, "learning_rate": 9.943e-05, "elapsed_time_per_iteration": 6.50549102, "memory(GiB)": 21.49, "elapsed_time": "1h 7m 10s", "remaining_time": "10h 28m 47s", "loss_scale": 1.0, "consumed_samples": 157184, "global_step/max_steps": "614/6362"} +{"lm loss": 5.32875729, "grad_norm": 1.25003099, "learning_rate": 9.942e-05, "elapsed_time_per_iteration": 6.44137239, "memory(GiB)": 21.49, "elapsed_time": "1h 7m 16s", "remaining_time": "10h 28m 39s", "loss_scale": 1.0, "consumed_samples": 157440, "global_step/max_steps": "615/6362"} +{"lm loss": 5.32363939, "grad_norm": 1.48339403, "learning_rate": 9.942e-05, "elapsed_time_per_iteration": 6.69078445, "memory(GiB)": 21.49, "elapsed_time": "1h 7m 23s", "remaining_time": "10h 28m 34s", "loss_scale": 1.0, "consumed_samples": 157696, "global_step/max_steps": "616/6362"} +{"lm loss": 5.33832407, "grad_norm": 1.38557112, "learning_rate": 9.942e-05, "elapsed_time_per_iteration": 6.66945386, "memory(GiB)": 21.49, "elapsed_time": "1h 7m 29s", "remaining_time": "10h 28m 28s", "loss_scale": 1.0, "consumed_samples": 157952, "global_step/max_steps": "617/6362"} +{"lm loss": 5.35133457, "grad_norm": 1.09129143, "learning_rate": 9.941e-05, "elapsed_time_per_iteration": 6.56209993, "memory(GiB)": 21.49, "elapsed_time": "1h 7m 36s", "remaining_time": "10h 28m 22s", "loss_scale": 1.0, "consumed_samples": 158208, "global_step/max_steps": "618/6362"} +{"lm loss": 5.32198715, "grad_norm": 1.38075352, "learning_rate": 9.941e-05, "elapsed_time_per_iteration": 6.38473296, "memory(GiB)": 21.49, "elapsed_time": "1h 7m 42s", "remaining_time": "10h 28m 14s", "loss_scale": 1.0, "consumed_samples": 158464, "global_step/max_steps": "619/6362"} +{"lm loss": 5.32307196, "grad_norm": 1.24166775, "learning_rate": 9.94e-05, "elapsed_time_per_iteration": 6.55251431, "memory(GiB)": 21.49, "elapsed_time": "1h 7m 49s", "remaining_time": "10h 28m 7s", "loss_scale": 1.0, "consumed_samples": 158720, "global_step/max_steps": "620/6362"} +{"lm loss": 5.31882095, "grad_norm": 1.46092522, "learning_rate": 9.94e-05, "elapsed_time_per_iteration": 6.4419589, "memory(GiB)": 21.49, "elapsed_time": "1h 7m 55s", "remaining_time": "10h 27m 59s", "loss_scale": 1.0, "consumed_samples": 158976, "global_step/max_steps": "621/6362"} +{"lm loss": 5.33448076, "grad_norm": 1.33382273, "learning_rate": 9.94e-05, "elapsed_time_per_iteration": 6.58652782, "memory(GiB)": 21.49, "elapsed_time": "1h 8m 2s", "remaining_time": "10h 27m 53s", "loss_scale": 1.0, "consumed_samples": 159232, "global_step/max_steps": "622/6362"} +{"lm loss": 5.33760595, "grad_norm": 1.3678304, "learning_rate": 9.939e-05, "elapsed_time_per_iteration": 6.56759977, "memory(GiB)": 21.49, "elapsed_time": "1h 8m 8s", "remaining_time": "10h 27m 46s", "loss_scale": 1.0, "consumed_samples": 159488, "global_step/max_steps": "623/6362"} +{"lm loss": 5.34563494, "grad_norm": 1.21748221, "learning_rate": 9.939e-05, "elapsed_time_per_iteration": 6.54595947, "memory(GiB)": 21.49, "elapsed_time": "1h 8m 15s", "remaining_time": "10h 27m 40s", "loss_scale": 1.0, "consumed_samples": 159744, "global_step/max_steps": "624/6362"} +{"lm loss": 5.33249998, "grad_norm": 1.3426317, "learning_rate": 9.938e-05, "elapsed_time_per_iteration": 6.71912932, "memory(GiB)": 21.49, "elapsed_time": "1h 8m 22s", "remaining_time": "10h 27m 34s", "loss_scale": 1.0, "consumed_samples": 160000, "global_step/max_steps": "625/6362"} +{"lm loss": 5.33991575, "grad_norm": 1.24307835, "learning_rate": 9.938e-05, "elapsed_time_per_iteration": 6.57768846, "memory(GiB)": 21.49, "elapsed_time": "1h 8m 28s", "remaining_time": "10h 27m 28s", "loss_scale": 1.0, "consumed_samples": 160256, "global_step/max_steps": "626/6362"} +{"lm loss": 5.33397341, "grad_norm": 1.30795169, "learning_rate": 9.938e-05, "elapsed_time_per_iteration": 6.50950408, "memory(GiB)": 21.49, "elapsed_time": "1h 8m 35s", "remaining_time": "10h 27m 21s", "loss_scale": 1.0, "consumed_samples": 160512, "global_step/max_steps": "627/6362"} +{"lm loss": 5.31854963, "grad_norm": 1.20730495, "learning_rate": 9.937e-05, "elapsed_time_per_iteration": 6.66331673, "memory(GiB)": 21.49, "elapsed_time": "1h 8m 41s", "remaining_time": "10h 27m 15s", "loss_scale": 1.0, "consumed_samples": 160768, "global_step/max_steps": "628/6362"} +{"lm loss": 5.32187796, "grad_norm": 1.1491046, "learning_rate": 9.937e-05, "elapsed_time_per_iteration": 6.54149818, "memory(GiB)": 21.49, "elapsed_time": "1h 8m 48s", "remaining_time": "10h 27m 9s", "loss_scale": 1.0, "consumed_samples": 161024, "global_step/max_steps": "629/6362"} +{"lm loss": 5.34773588, "grad_norm": 1.33061433, "learning_rate": 9.936e-05, "elapsed_time_per_iteration": 6.56211114, "memory(GiB)": 21.49, "elapsed_time": "1h 8m 55s", "remaining_time": "10h 27m 2s", "loss_scale": 1.0, "consumed_samples": 161280, "global_step/max_steps": "630/6362"} +{"lm loss": 5.34618187, "grad_norm": 1.27876794, "learning_rate": 9.936e-05, "elapsed_time_per_iteration": 6.48323011, "memory(GiB)": 21.49, "elapsed_time": "1h 9m 1s", "remaining_time": "10h 26m 55s", "loss_scale": 1.0, "consumed_samples": 161536, "global_step/max_steps": "631/6362"} +{"lm loss": 5.33354425, "grad_norm": 1.21522021, "learning_rate": 9.936e-05, "elapsed_time_per_iteration": 6.48523617, "memory(GiB)": 21.49, "elapsed_time": "1h 9m 8s", "remaining_time": "10h 26m 47s", "loss_scale": 1.0, "consumed_samples": 161792, "global_step/max_steps": "632/6362"} +{"lm loss": 5.33338499, "grad_norm": 1.21628988, "learning_rate": 9.935e-05, "elapsed_time_per_iteration": 6.47594261, "memory(GiB)": 21.49, "elapsed_time": "1h 9m 14s", "remaining_time": "10h 26m 40s", "loss_scale": 1.0, "consumed_samples": 162048, "global_step/max_steps": "633/6362"} +{"lm loss": 5.33305931, "grad_norm": 1.3090142, "learning_rate": 9.935e-05, "elapsed_time_per_iteration": 6.42644525, "memory(GiB)": 21.49, "elapsed_time": "1h 9m 20s", "remaining_time": "10h 26m 32s", "loss_scale": 1.0, "consumed_samples": 162304, "global_step/max_steps": "634/6362"} +{"lm loss": 5.33117914, "grad_norm": 1.324067, "learning_rate": 9.934e-05, "elapsed_time_per_iteration": 6.31376839, "memory(GiB)": 21.49, "elapsed_time": "1h 9m 27s", "remaining_time": "10h 26m 23s", "loss_scale": 1.0, "consumed_samples": 162560, "global_step/max_steps": "635/6362"} +{"lm loss": 5.33012962, "grad_norm": 1.20282638, "learning_rate": 9.934e-05, "elapsed_time_per_iteration": 6.55228257, "memory(GiB)": 21.49, "elapsed_time": "1h 9m 33s", "remaining_time": "10h 26m 17s", "loss_scale": 1.0, "consumed_samples": 162816, "global_step/max_steps": "636/6362"} +{"lm loss": 5.31843758, "grad_norm": 1.30182874, "learning_rate": 9.934e-05, "elapsed_time_per_iteration": 6.82157326, "memory(GiB)": 21.49, "elapsed_time": "1h 9m 40s", "remaining_time": "10h 26m 13s", "loss_scale": 1.0, "consumed_samples": 163072, "global_step/max_steps": "637/6362"} +{"lm loss": 5.31930399, "grad_norm": 1.18718469, "learning_rate": 9.933e-05, "elapsed_time_per_iteration": 6.54400682, "memory(GiB)": 21.49, "elapsed_time": "1h 9m 47s", "remaining_time": "10h 26m 6s", "loss_scale": 1.0, "consumed_samples": 163328, "global_step/max_steps": "638/6362"} +{"lm loss": 5.32805586, "grad_norm": 1.47797632, "learning_rate": 9.933e-05, "elapsed_time_per_iteration": 6.66260958, "memory(GiB)": 21.49, "elapsed_time": "1h 9m 53s", "remaining_time": "10h 26m 0s", "loss_scale": 1.0, "consumed_samples": 163584, "global_step/max_steps": "639/6362"} +{"lm loss": 5.32602501, "grad_norm": 1.13558841, "learning_rate": 9.932e-05, "elapsed_time_per_iteration": 6.42718983, "memory(GiB)": 21.49, "elapsed_time": "1h 10m 0s", "remaining_time": "10h 25m 52s", "loss_scale": 1.0, "consumed_samples": 163840, "global_step/max_steps": "640/6362"} +{"lm loss": 5.33348703, "grad_norm": 1.19248426, "learning_rate": 9.932e-05, "elapsed_time_per_iteration": 6.55943131, "memory(GiB)": 21.49, "elapsed_time": "1h 10m 6s", "remaining_time": "10h 25m 46s", "loss_scale": 1.0, "consumed_samples": 164096, "global_step/max_steps": "641/6362"} +{"lm loss": 5.33500051, "grad_norm": 1.25984156, "learning_rate": 9.931e-05, "elapsed_time_per_iteration": 6.47174048, "memory(GiB)": 21.49, "elapsed_time": "1h 10m 13s", "remaining_time": "10h 25m 38s", "loss_scale": 1.0, "consumed_samples": 164352, "global_step/max_steps": "642/6362"} +{"lm loss": 5.31057358, "grad_norm": 1.41496682, "learning_rate": 9.931e-05, "elapsed_time_per_iteration": 6.50517154, "memory(GiB)": 21.49, "elapsed_time": "1h 10m 19s", "remaining_time": "10h 25m 31s", "loss_scale": 1.0, "consumed_samples": 164608, "global_step/max_steps": "643/6362"} +{"lm loss": 5.31142855, "grad_norm": 1.06497717, "learning_rate": 9.931e-05, "elapsed_time_per_iteration": 6.48680115, "memory(GiB)": 21.49, "elapsed_time": "1h 10m 26s", "remaining_time": "10h 25m 24s", "loss_scale": 1.0, "consumed_samples": 164864, "global_step/max_steps": "644/6362"} +{"lm loss": 5.3304472, "grad_norm": 1.61061621, "learning_rate": 9.93e-05, "elapsed_time_per_iteration": 6.62026262, "memory(GiB)": 21.49, "elapsed_time": "1h 10m 32s", "remaining_time": "10h 25m 18s", "loss_scale": 1.0, "consumed_samples": 165120, "global_step/max_steps": "645/6362"} +{"lm loss": 5.33548355, "grad_norm": 1.2214911, "learning_rate": 9.93e-05, "elapsed_time_per_iteration": 6.5488174, "memory(GiB)": 21.49, "elapsed_time": "1h 10m 39s", "remaining_time": "10h 25m 11s", "loss_scale": 1.0, "consumed_samples": 165376, "global_step/max_steps": "646/6362"} +{"lm loss": 5.31400871, "grad_norm": 1.49997437, "learning_rate": 9.929e-05, "elapsed_time_per_iteration": 6.59247041, "memory(GiB)": 21.49, "elapsed_time": "1h 10m 46s", "remaining_time": "10h 25m 5s", "loss_scale": 1.0, "consumed_samples": 165632, "global_step/max_steps": "647/6362"} +{"lm loss": 5.32126713, "grad_norm": 1.26262188, "learning_rate": 9.929e-05, "elapsed_time_per_iteration": 6.54736972, "memory(GiB)": 21.51, "elapsed_time": "1h 10m 52s", "remaining_time": "10h 24m 58s", "loss_scale": 1.0, "consumed_samples": 165888, "global_step/max_steps": "648/6362"} +{"lm loss": 5.33778095, "grad_norm": 1.44010568, "learning_rate": 9.928e-05, "elapsed_time_per_iteration": 6.63989687, "memory(GiB)": 21.51, "elapsed_time": "1h 10m 59s", "remaining_time": "10h 24m 52s", "loss_scale": 1.0, "consumed_samples": 166144, "global_step/max_steps": "649/6362"} +{"lm loss": 5.33770847, "grad_norm": 1.17342472, "learning_rate": 9.928e-05, "elapsed_time_per_iteration": 6.57486153, "memory(GiB)": 21.51, "elapsed_time": "1h 11m 5s", "remaining_time": "10h 24m 46s", "loss_scale": 1.0, "consumed_samples": 166400, "global_step/max_steps": "650/6362"} +{"lm loss": 5.32047415, "grad_norm": 1.2754432, "learning_rate": 9.928e-05, "elapsed_time_per_iteration": 6.58874965, "memory(GiB)": 21.51, "elapsed_time": "1h 11m 12s", "remaining_time": "10h 24m 40s", "loss_scale": 1.0, "consumed_samples": 166656, "global_step/max_steps": "651/6362"} +{"lm loss": 5.3172226, "grad_norm": 1.1865108, "learning_rate": 9.927e-05, "elapsed_time_per_iteration": 6.77785134, "memory(GiB)": 21.51, "elapsed_time": "1h 11m 19s", "remaining_time": "10h 24m 35s", "loss_scale": 1.0, "consumed_samples": 166912, "global_step/max_steps": "652/6362"} +{"lm loss": 5.30517673, "grad_norm": 1.51991761, "learning_rate": 9.927e-05, "elapsed_time_per_iteration": 6.74003267, "memory(GiB)": 21.51, "elapsed_time": "1h 11m 25s", "remaining_time": "10h 24m 30s", "loss_scale": 1.0, "consumed_samples": 167168, "global_step/max_steps": "653/6362"} +{"lm loss": 5.32003736, "grad_norm": 1.23318827, "learning_rate": 9.926e-05, "elapsed_time_per_iteration": 6.67472601, "memory(GiB)": 21.51, "elapsed_time": "1h 11m 32s", "remaining_time": "10h 24m 24s", "loss_scale": 1.0, "consumed_samples": 167424, "global_step/max_steps": "654/6362"} +{"lm loss": 5.32757664, "grad_norm": 1.27999902, "learning_rate": 9.926e-05, "elapsed_time_per_iteration": 6.39650035, "memory(GiB)": 21.51, "elapsed_time": "1h 11m 38s", "remaining_time": "10h 24m 16s", "loss_scale": 1.0, "consumed_samples": 167680, "global_step/max_steps": "655/6362"} +{"lm loss": 5.3119607, "grad_norm": 1.23298585, "learning_rate": 9.925e-05, "elapsed_time_per_iteration": 6.52190638, "memory(GiB)": 21.51, "elapsed_time": "1h 11m 45s", "remaining_time": "10h 24m 9s", "loss_scale": 1.0, "consumed_samples": 167936, "global_step/max_steps": "656/6362"} +{"lm loss": 5.32336617, "grad_norm": 1.18526566, "learning_rate": 9.925e-05, "elapsed_time_per_iteration": 6.52224779, "memory(GiB)": 21.51, "elapsed_time": "1h 11m 52s", "remaining_time": "10h 24m 3s", "loss_scale": 1.0, "consumed_samples": 168192, "global_step/max_steps": "657/6362"} +{"lm loss": 5.31578445, "grad_norm": 1.10286772, "learning_rate": 9.924e-05, "elapsed_time_per_iteration": 6.62377977, "memory(GiB)": 21.51, "elapsed_time": "1h 11m 58s", "remaining_time": "10h 23m 56s", "loss_scale": 1.0, "consumed_samples": 168448, "global_step/max_steps": "658/6362"} +{"lm loss": 5.31268311, "grad_norm": 1.30221128, "learning_rate": 9.924e-05, "elapsed_time_per_iteration": 6.53836632, "memory(GiB)": 21.51, "elapsed_time": "1h 12m 5s", "remaining_time": "10h 23m 50s", "loss_scale": 1.0, "consumed_samples": 168704, "global_step/max_steps": "659/6362"} +{"lm loss": 5.32123423, "grad_norm": 1.27956975, "learning_rate": 9.924e-05, "elapsed_time_per_iteration": 6.52701592, "memory(GiB)": 21.51, "elapsed_time": "1h 12m 11s", "remaining_time": "10h 23m 43s", "loss_scale": 1.0, "consumed_samples": 168960, "global_step/max_steps": "660/6362"} +{"lm loss": 5.33724403, "grad_norm": 1.27604973, "learning_rate": 9.923e-05, "elapsed_time_per_iteration": 6.59957576, "memory(GiB)": 21.51, "elapsed_time": "1h 12m 18s", "remaining_time": "10h 23m 37s", "loss_scale": 1.0, "consumed_samples": 169216, "global_step/max_steps": "661/6362"} +{"lm loss": 5.31979942, "grad_norm": 1.24590528, "learning_rate": 9.923e-05, "elapsed_time_per_iteration": 6.46887612, "memory(GiB)": 21.51, "elapsed_time": "1h 12m 24s", "remaining_time": "10h 23m 29s", "loss_scale": 1.0, "consumed_samples": 169472, "global_step/max_steps": "662/6362"} +{"lm loss": 5.32341957, "grad_norm": 1.29696298, "learning_rate": 9.922e-05, "elapsed_time_per_iteration": 6.58464742, "memory(GiB)": 21.51, "elapsed_time": "1h 12m 31s", "remaining_time": "10h 23m 23s", "loss_scale": 1.0, "consumed_samples": 169728, "global_step/max_steps": "663/6362"} +{"lm loss": 5.32228136, "grad_norm": 1.28654575, "learning_rate": 9.922e-05, "elapsed_time_per_iteration": 6.53412199, "memory(GiB)": 21.51, "elapsed_time": "1h 12m 37s", "remaining_time": "10h 23m 16s", "loss_scale": 1.0, "consumed_samples": 169984, "global_step/max_steps": "664/6362"} +{"lm loss": 5.29519367, "grad_norm": 1.51018405, "learning_rate": 9.921e-05, "elapsed_time_per_iteration": 6.50371242, "memory(GiB)": 21.51, "elapsed_time": "1h 12m 44s", "remaining_time": "10h 23m 9s", "loss_scale": 1.0, "consumed_samples": 170240, "global_step/max_steps": "665/6362"} +{"lm loss": 5.31990719, "grad_norm": 1.02601576, "learning_rate": 9.921e-05, "elapsed_time_per_iteration": 6.58974433, "memory(GiB)": 21.51, "elapsed_time": "1h 12m 50s", "remaining_time": "10h 23m 3s", "loss_scale": 1.0, "consumed_samples": 170496, "global_step/max_steps": "666/6362"} +{"lm loss": 5.31965065, "grad_norm": 1.45274746, "learning_rate": 9.92e-05, "elapsed_time_per_iteration": 7.66017747, "memory(GiB)": 21.51, "elapsed_time": "1h 12m 58s", "remaining_time": "10h 23m 5s", "loss_scale": 1.0, "consumed_samples": 170752, "global_step/max_steps": "667/6362"} +{"lm loss": 5.32974434, "grad_norm": 1.08744061, "learning_rate": 9.92e-05, "elapsed_time_per_iteration": 6.63581491, "memory(GiB)": 21.51, "elapsed_time": "1h 13m 5s", "remaining_time": "10h 22m 59s", "loss_scale": 1.0, "consumed_samples": 171008, "global_step/max_steps": "668/6362"} +{"lm loss": 5.31816483, "grad_norm": 1.47918248, "learning_rate": 9.92e-05, "elapsed_time_per_iteration": 6.41756296, "memory(GiB)": 21.51, "elapsed_time": "1h 13m 11s", "remaining_time": "10h 22m 52s", "loss_scale": 1.0, "consumed_samples": 171264, "global_step/max_steps": "669/6362"} +{"lm loss": 5.32127237, "grad_norm": 1.01955318, "learning_rate": 9.919e-05, "elapsed_time_per_iteration": 6.68093991, "memory(GiB)": 21.51, "elapsed_time": "1h 13m 18s", "remaining_time": "10h 22m 46s", "loss_scale": 1.0, "consumed_samples": 171520, "global_step/max_steps": "670/6362"} +{"lm loss": 5.32604933, "grad_norm": 1.681036, "learning_rate": 9.919e-05, "elapsed_time_per_iteration": 6.50430608, "memory(GiB)": 21.51, "elapsed_time": "1h 13m 24s", "remaining_time": "10h 22m 39s", "loss_scale": 1.0, "consumed_samples": 171776, "global_step/max_steps": "671/6362"} +{"lm loss": 5.32840252, "grad_norm": 0.99458444, "learning_rate": 9.918e-05, "elapsed_time_per_iteration": 6.67681384, "memory(GiB)": 21.51, "elapsed_time": "1h 13m 31s", "remaining_time": "10h 22m 33s", "loss_scale": 1.0, "consumed_samples": 172032, "global_step/max_steps": "672/6362"} +{"lm loss": 5.31223297, "grad_norm": 1.20840406, "learning_rate": 9.918e-05, "elapsed_time_per_iteration": 6.50203323, "memory(GiB)": 21.51, "elapsed_time": "1h 13m 38s", "remaining_time": "10h 22m 26s", "loss_scale": 1.0, "consumed_samples": 172288, "global_step/max_steps": "673/6362"} +{"lm loss": 5.32691288, "grad_norm": 1.2544657, "learning_rate": 9.917e-05, "elapsed_time_per_iteration": 6.56714439, "memory(GiB)": 21.51, "elapsed_time": "1h 13m 44s", "remaining_time": "10h 22m 20s", "loss_scale": 1.0, "consumed_samples": 172544, "global_step/max_steps": "674/6362"} +{"lm loss": 5.32746363, "grad_norm": 1.33075869, "learning_rate": 9.917e-05, "elapsed_time_per_iteration": 6.6600256, "memory(GiB)": 21.51, "elapsed_time": "1h 13m 51s", "remaining_time": "10h 22m 14s", "loss_scale": 1.0, "consumed_samples": 172800, "global_step/max_steps": "675/6362"} +{"lm loss": 5.28390455, "grad_norm": 1.41315997, "learning_rate": 9.916e-05, "elapsed_time_per_iteration": 6.43092346, "memory(GiB)": 21.51, "elapsed_time": "1h 13m 57s", "remaining_time": "10h 22m 6s", "loss_scale": 1.0, "consumed_samples": 173056, "global_step/max_steps": "676/6362"} +{"lm loss": 5.30310059, "grad_norm": 1.00813055, "learning_rate": 9.916e-05, "elapsed_time_per_iteration": 6.5349586, "memory(GiB)": 21.51, "elapsed_time": "1h 14m 4s", "remaining_time": "10h 21m 59s", "loss_scale": 1.0, "consumed_samples": 173312, "global_step/max_steps": "677/6362"} +{"lm loss": 5.31445503, "grad_norm": 1.31974459, "learning_rate": 9.915e-05, "elapsed_time_per_iteration": 6.55322123, "memory(GiB)": 21.51, "elapsed_time": "1h 14m 10s", "remaining_time": "10h 21m 53s", "loss_scale": 1.0, "consumed_samples": 173568, "global_step/max_steps": "678/6362"} +{"lm loss": 5.2970233, "grad_norm": 1.02663517, "learning_rate": 9.915e-05, "elapsed_time_per_iteration": 6.40615201, "memory(GiB)": 21.51, "elapsed_time": "1h 14m 17s", "remaining_time": "10h 21m 45s", "loss_scale": 1.0, "consumed_samples": 173824, "global_step/max_steps": "679/6362"} +{"lm loss": 5.29963398, "grad_norm": 1.12574601, "learning_rate": 9.914e-05, "elapsed_time_per_iteration": 6.60020375, "memory(GiB)": 21.51, "elapsed_time": "1h 14m 23s", "remaining_time": "10h 21m 39s", "loss_scale": 1.0, "consumed_samples": 174080, "global_step/max_steps": "680/6362"} +{"lm loss": 5.29648447, "grad_norm": 1.10249329, "learning_rate": 9.914e-05, "elapsed_time_per_iteration": 6.6826148, "memory(GiB)": 21.51, "elapsed_time": "1h 14m 30s", "remaining_time": "10h 21m 33s", "loss_scale": 1.0, "consumed_samples": 174336, "global_step/max_steps": "681/6362"} +{"lm loss": 5.32015228, "grad_norm": 1.38388717, "learning_rate": 9.913e-05, "elapsed_time_per_iteration": 6.52886701, "memory(GiB)": 21.51, "elapsed_time": "1h 14m 37s", "remaining_time": "10h 21m 26s", "loss_scale": 1.0, "consumed_samples": 174592, "global_step/max_steps": "682/6362"} +{"lm loss": 5.28247643, "grad_norm": 1.39550233, "learning_rate": 9.913e-05, "elapsed_time_per_iteration": 6.42455578, "memory(GiB)": 21.51, "elapsed_time": "1h 14m 43s", "remaining_time": "10h 21m 18s", "loss_scale": 1.0, "consumed_samples": 174848, "global_step/max_steps": "683/6362"} +{"lm loss": 5.30857468, "grad_norm": 1.19042778, "learning_rate": 9.913e-05, "elapsed_time_per_iteration": 6.48986149, "memory(GiB)": 21.51, "elapsed_time": "1h 14m 49s", "remaining_time": "10h 21m 11s", "loss_scale": 1.0, "consumed_samples": 175104, "global_step/max_steps": "684/6362"} +{"lm loss": 5.30535746, "grad_norm": 1.2212739, "learning_rate": 9.912e-05, "elapsed_time_per_iteration": 6.51825166, "memory(GiB)": 21.51, "elapsed_time": "1h 14m 56s", "remaining_time": "10h 21m 4s", "loss_scale": 1.0, "consumed_samples": 175360, "global_step/max_steps": "685/6362"} +{"lm loss": 5.305233, "grad_norm": 1.15443313, "learning_rate": 9.912e-05, "elapsed_time_per_iteration": 6.70277858, "memory(GiB)": 21.51, "elapsed_time": "1h 15m 3s", "remaining_time": "10h 20m 59s", "loss_scale": 1.0, "consumed_samples": 175616, "global_step/max_steps": "686/6362"} +{"lm loss": 5.30393887, "grad_norm": 1.23635209, "learning_rate": 9.911e-05, "elapsed_time_per_iteration": 6.42176723, "memory(GiB)": 21.51, "elapsed_time": "1h 15m 9s", "remaining_time": "10h 20m 51s", "loss_scale": 1.0, "consumed_samples": 175872, "global_step/max_steps": "687/6362"} +{"lm loss": 5.30914593, "grad_norm": 1.12311912, "learning_rate": 9.911e-05, "elapsed_time_per_iteration": 6.67958665, "memory(GiB)": 21.51, "elapsed_time": "1h 15m 16s", "remaining_time": "10h 20m 46s", "loss_scale": 1.0, "consumed_samples": 176128, "global_step/max_steps": "688/6362"} +{"lm loss": 5.28404474, "grad_norm": 1.27075315, "learning_rate": 9.91e-05, "elapsed_time_per_iteration": 6.52262306, "memory(GiB)": 21.51, "elapsed_time": "1h 15m 22s", "remaining_time": "10h 20m 39s", "loss_scale": 1.0, "consumed_samples": 176384, "global_step/max_steps": "689/6362"} +{"lm loss": 5.30387783, "grad_norm": 1.32666612, "learning_rate": 9.91e-05, "elapsed_time_per_iteration": 6.51402283, "memory(GiB)": 21.51, "elapsed_time": "1h 15m 29s", "remaining_time": "10h 20m 32s", "loss_scale": 1.0, "consumed_samples": 176640, "global_step/max_steps": "690/6362"} +{"lm loss": 5.29113674, "grad_norm": 1.27474165, "learning_rate": 9.909e-05, "elapsed_time_per_iteration": 6.750036, "memory(GiB)": 21.51, "elapsed_time": "1h 15m 36s", "remaining_time": "10h 20m 27s", "loss_scale": 1.0, "consumed_samples": 176896, "global_step/max_steps": "691/6362"} +{"lm loss": 5.30160761, "grad_norm": 1.07345986, "learning_rate": 9.909e-05, "elapsed_time_per_iteration": 6.6028316, "memory(GiB)": 21.51, "elapsed_time": "1h 15m 42s", "remaining_time": "10h 20m 20s", "loss_scale": 1.0, "consumed_samples": 177152, "global_step/max_steps": "692/6362"} +{"lm loss": 5.314888, "grad_norm": 1.23826349, "learning_rate": 9.908e-05, "elapsed_time_per_iteration": 6.45632982, "memory(GiB)": 21.51, "elapsed_time": "1h 15m 49s", "remaining_time": "10h 20m 13s", "loss_scale": 1.0, "consumed_samples": 177408, "global_step/max_steps": "693/6362"} +{"lm loss": 5.30123901, "grad_norm": 1.16842914, "learning_rate": 9.908e-05, "elapsed_time_per_iteration": 6.614079, "memory(GiB)": 21.51, "elapsed_time": "1h 15m 55s", "remaining_time": "10h 20m 7s", "loss_scale": 1.0, "consumed_samples": 177664, "global_step/max_steps": "694/6362"} +{"lm loss": 5.31038857, "grad_norm": 1.28438604, "learning_rate": 9.907e-05, "elapsed_time_per_iteration": 6.5031321, "memory(GiB)": 21.51, "elapsed_time": "1h 16m 2s", "remaining_time": "10h 20m 0s", "loss_scale": 1.0, "consumed_samples": 177920, "global_step/max_steps": "695/6362"} +{"lm loss": 5.30772877, "grad_norm": 1.06513906, "learning_rate": 9.907e-05, "elapsed_time_per_iteration": 6.45978165, "memory(GiB)": 21.51, "elapsed_time": "1h 16m 8s", "remaining_time": "10h 19m 52s", "loss_scale": 1.0, "consumed_samples": 178176, "global_step/max_steps": "696/6362"} +{"lm loss": 5.29405212, "grad_norm": 1.42804337, "learning_rate": 9.906e-05, "elapsed_time_per_iteration": 6.39178634, "memory(GiB)": 21.51, "elapsed_time": "1h 16m 15s", "remaining_time": "10h 19m 44s", "loss_scale": 1.0, "consumed_samples": 178432, "global_step/max_steps": "697/6362"} +{"lm loss": 5.31394482, "grad_norm": 1.16421664, "learning_rate": 9.906e-05, "elapsed_time_per_iteration": 6.34089231, "memory(GiB)": 21.51, "elapsed_time": "1h 16m 21s", "remaining_time": "10h 19m 36s", "loss_scale": 1.0, "consumed_samples": 178688, "global_step/max_steps": "698/6362"} +{"lm loss": 5.30387306, "grad_norm": 1.0977546, "learning_rate": 9.905e-05, "elapsed_time_per_iteration": 6.46242285, "memory(GiB)": 21.51, "elapsed_time": "1h 16m 27s", "remaining_time": "10h 19m 29s", "loss_scale": 1.0, "consumed_samples": 178944, "global_step/max_steps": "699/6362"} +{"lm loss": 5.30111265, "grad_norm": 1.57592249, "learning_rate": 9.905e-05, "elapsed_time_per_iteration": 6.47446227, "memory(GiB)": 21.51, "elapsed_time": "1h 16m 34s", "remaining_time": "10h 19m 21s", "loss_scale": 1.0, "consumed_samples": 179200, "global_step/max_steps": "700/6362"} +{"lm loss": 5.31650209, "grad_norm": 0.98119247, "learning_rate": 9.904e-05, "elapsed_time_per_iteration": 6.43927336, "memory(GiB)": 21.51, "elapsed_time": "1h 16m 40s", "remaining_time": "10h 19m 14s", "loss_scale": 1.0, "consumed_samples": 179456, "global_step/max_steps": "701/6362"} +{"lm loss": 5.29730177, "grad_norm": 1.30231118, "learning_rate": 9.904e-05, "elapsed_time_per_iteration": 6.5429213, "memory(GiB)": 21.51, "elapsed_time": "1h 16m 47s", "remaining_time": "10h 19m 7s", "loss_scale": 1.0, "consumed_samples": 179712, "global_step/max_steps": "702/6362"} +{"lm loss": 5.29930401, "grad_norm": 1.22239709, "learning_rate": 9.903e-05, "elapsed_time_per_iteration": 6.59795117, "memory(GiB)": 21.51, "elapsed_time": "1h 16m 53s", "remaining_time": "10h 19m 1s", "loss_scale": 1.0, "consumed_samples": 179968, "global_step/max_steps": "703/6362"} +{"lm loss": 5.29442549, "grad_norm": 1.3107326, "learning_rate": 9.903e-05, "elapsed_time_per_iteration": 6.49246407, "memory(GiB)": 21.51, "elapsed_time": "1h 17m 0s", "remaining_time": "10h 18m 54s", "loss_scale": 1.0, "consumed_samples": 180224, "global_step/max_steps": "704/6362"} +{"lm loss": 5.30940342, "grad_norm": 0.94555318, "learning_rate": 9.902e-05, "elapsed_time_per_iteration": 6.64911413, "memory(GiB)": 21.51, "elapsed_time": "1h 17m 7s", "remaining_time": "10h 18m 48s", "loss_scale": 1.0, "consumed_samples": 180480, "global_step/max_steps": "705/6362"} +{"lm loss": 5.30249691, "grad_norm": 1.39201128, "learning_rate": 9.902e-05, "elapsed_time_per_iteration": 6.58078027, "memory(GiB)": 21.51, "elapsed_time": "1h 17m 13s", "remaining_time": "10h 18m 41s", "loss_scale": 1.0, "consumed_samples": 180736, "global_step/max_steps": "706/6362"} +{"lm loss": 5.31022358, "grad_norm": 1.26277506, "learning_rate": 9.901e-05, "elapsed_time_per_iteration": 6.50953674, "memory(GiB)": 21.51, "elapsed_time": "1h 17m 20s", "remaining_time": "10h 18m 34s", "loss_scale": 1.0, "consumed_samples": 180992, "global_step/max_steps": "707/6362"} +{"lm loss": 5.30214977, "grad_norm": 1.31311989, "learning_rate": 9.901e-05, "elapsed_time_per_iteration": 6.73555446, "memory(GiB)": 21.51, "elapsed_time": "1h 17m 26s", "remaining_time": "10h 18m 29s", "loss_scale": 1.0, "consumed_samples": 181248, "global_step/max_steps": "708/6362"} +{"lm loss": 5.28481388, "grad_norm": 1.15354121, "learning_rate": 9.9e-05, "elapsed_time_per_iteration": 6.53184438, "memory(GiB)": 21.51, "elapsed_time": "1h 17m 33s", "remaining_time": "10h 18m 22s", "loss_scale": 1.0, "consumed_samples": 181504, "global_step/max_steps": "709/6362"} +{"lm loss": 5.27111292, "grad_norm": 1.49980581, "learning_rate": 9.9e-05, "elapsed_time_per_iteration": 6.82230115, "memory(GiB)": 21.51, "elapsed_time": "1h 17m 40s", "remaining_time": "10h 18m 18s", "loss_scale": 1.0, "consumed_samples": 181760, "global_step/max_steps": "710/6362"} +{"lm loss": 5.28815651, "grad_norm": 1.26711655, "learning_rate": 9.899e-05, "elapsed_time_per_iteration": 6.411062, "memory(GiB)": 21.51, "elapsed_time": "1h 17m 46s", "remaining_time": "10h 18m 10s", "loss_scale": 1.0, "consumed_samples": 182016, "global_step/max_steps": "711/6362"} +{"lm loss": 5.29530716, "grad_norm": 1.1452117, "learning_rate": 9.899e-05, "elapsed_time_per_iteration": 6.50934029, "memory(GiB)": 21.51, "elapsed_time": "1h 17m 53s", "remaining_time": "10h 18m 3s", "loss_scale": 1.0, "consumed_samples": 182272, "global_step/max_steps": "712/6362"} +{"lm loss": 5.29699707, "grad_norm": 1.16167963, "learning_rate": 9.898e-05, "elapsed_time_per_iteration": 6.84384179, "memory(GiB)": 21.51, "elapsed_time": "1h 18m 0s", "remaining_time": "10h 17m 59s", "loss_scale": 1.0, "consumed_samples": 182528, "global_step/max_steps": "713/6362"} +{"lm loss": 5.28833342, "grad_norm": 1.07583797, "learning_rate": 9.898e-05, "elapsed_time_per_iteration": 6.46545196, "memory(GiB)": 21.51, "elapsed_time": "1h 18m 6s", "remaining_time": "10h 17m 51s", "loss_scale": 1.0, "consumed_samples": 182784, "global_step/max_steps": "714/6362"} +{"lm loss": 5.28380156, "grad_norm": 1.37656379, "learning_rate": 9.897e-05, "elapsed_time_per_iteration": 6.77984715, "memory(GiB)": 21.51, "elapsed_time": "1h 18m 13s", "remaining_time": "10h 17m 46s", "loss_scale": 1.0, "consumed_samples": 183040, "global_step/max_steps": "715/6362"} +{"lm loss": 5.29564762, "grad_norm": 1.16399622, "learning_rate": 9.897e-05, "elapsed_time_per_iteration": 6.87873006, "memory(GiB)": 21.51, "elapsed_time": "1h 18m 20s", "remaining_time": "10h 17m 42s", "loss_scale": 1.0, "consumed_samples": 183296, "global_step/max_steps": "716/6362"} +{"lm loss": 5.27762985, "grad_norm": 1.12714875, "learning_rate": 9.896e-05, "elapsed_time_per_iteration": 6.67242813, "memory(GiB)": 21.51, "elapsed_time": "1h 18m 26s", "remaining_time": "10h 17m 37s", "loss_scale": 1.0, "consumed_samples": 183552, "global_step/max_steps": "717/6362"} +{"lm loss": 5.29317999, "grad_norm": 1.1486119, "learning_rate": 9.896e-05, "elapsed_time_per_iteration": 6.38993621, "memory(GiB)": 21.51, "elapsed_time": "1h 18m 33s", "remaining_time": "10h 17m 29s", "loss_scale": 1.0, "consumed_samples": 183808, "global_step/max_steps": "718/6362"} +{"lm loss": 5.30117941, "grad_norm": 1.04670906, "learning_rate": 9.895e-05, "elapsed_time_per_iteration": 6.97282338, "memory(GiB)": 21.51, "elapsed_time": "1h 18m 40s", "remaining_time": "10h 17m 25s", "loss_scale": 1.0, "consumed_samples": 184064, "global_step/max_steps": "719/6362"} +{"lm loss": 5.26908207, "grad_norm": 1.12465322, "learning_rate": 9.895e-05, "elapsed_time_per_iteration": 6.68302107, "memory(GiB)": 21.51, "elapsed_time": "1h 18m 46s", "remaining_time": "10h 17m 20s", "loss_scale": 1.0, "consumed_samples": 184320, "global_step/max_steps": "720/6362"} +{"lm loss": 5.27692938, "grad_norm": 1.32042336, "learning_rate": 9.894e-05, "elapsed_time_per_iteration": 6.72925711, "memory(GiB)": 21.51, "elapsed_time": "1h 18m 53s", "remaining_time": "10h 17m 14s", "loss_scale": 1.0, "consumed_samples": 184576, "global_step/max_steps": "721/6362"} +{"lm loss": 5.2801013, "grad_norm": 1.01980829, "learning_rate": 9.894e-05, "elapsed_time_per_iteration": 6.75958276, "memory(GiB)": 21.51, "elapsed_time": "1h 19m 0s", "remaining_time": "10h 17m 9s", "loss_scale": 1.0, "consumed_samples": 184832, "global_step/max_steps": "722/6362"} +{"lm loss": 5.291049, "grad_norm": 1.34112918, "learning_rate": 9.893e-05, "elapsed_time_per_iteration": 6.63586545, "memory(GiB)": 21.51, "elapsed_time": "1h 19m 6s", "remaining_time": "10h 17m 3s", "loss_scale": 1.0, "consumed_samples": 185088, "global_step/max_steps": "723/6362"} +{"lm loss": 5.28789139, "grad_norm": 1.1218735, "learning_rate": 9.892e-05, "elapsed_time_per_iteration": 6.81252623, "memory(GiB)": 21.51, "elapsed_time": "1h 19m 13s", "remaining_time": "10h 16m 59s", "loss_scale": 1.0, "consumed_samples": 185344, "global_step/max_steps": "724/6362"} +{"lm loss": 5.28119326, "grad_norm": 1.34090447, "learning_rate": 9.892e-05, "elapsed_time_per_iteration": 6.63413787, "memory(GiB)": 21.51, "elapsed_time": "1h 19m 20s", "remaining_time": "10h 16m 53s", "loss_scale": 1.0, "consumed_samples": 185600, "global_step/max_steps": "725/6362"} +{"lm loss": 5.29222965, "grad_norm": 1.03696573, "learning_rate": 9.891e-05, "elapsed_time_per_iteration": 6.6675787, "memory(GiB)": 21.51, "elapsed_time": "1h 19m 27s", "remaining_time": "10h 16m 47s", "loss_scale": 1.0, "consumed_samples": 185856, "global_step/max_steps": "726/6362"} +{"lm loss": 5.29185247, "grad_norm": 1.41121316, "learning_rate": 9.891e-05, "elapsed_time_per_iteration": 6.91737723, "memory(GiB)": 21.51, "elapsed_time": "1h 19m 34s", "remaining_time": "10h 16m 43s", "loss_scale": 1.0, "consumed_samples": 186112, "global_step/max_steps": "727/6362"} +{"lm loss": 5.31463003, "grad_norm": 1.09050715, "learning_rate": 9.89e-05, "elapsed_time_per_iteration": 6.57137442, "memory(GiB)": 21.51, "elapsed_time": "1h 19m 40s", "remaining_time": "10h 16m 37s", "loss_scale": 1.0, "consumed_samples": 186368, "global_step/max_steps": "728/6362"} +{"lm loss": 5.27797842, "grad_norm": 1.3531388, "learning_rate": 9.89e-05, "elapsed_time_per_iteration": 6.6453042, "memory(GiB)": 21.51, "elapsed_time": "1h 19m 47s", "remaining_time": "10h 16m 31s", "loss_scale": 1.0, "consumed_samples": 186624, "global_step/max_steps": "729/6362"} +{"lm loss": 5.29841471, "grad_norm": 1.22629082, "learning_rate": 9.889e-05, "elapsed_time_per_iteration": 6.68314147, "memory(GiB)": 21.51, "elapsed_time": "1h 19m 53s", "remaining_time": "10h 16m 25s", "loss_scale": 1.0, "consumed_samples": 186880, "global_step/max_steps": "730/6362"} +{"lm loss": 5.28914547, "grad_norm": 1.23388386, "learning_rate": 9.889e-05, "elapsed_time_per_iteration": 6.65734744, "memory(GiB)": 21.51, "elapsed_time": "1h 20m 0s", "remaining_time": "10h 16m 19s", "loss_scale": 1.0, "consumed_samples": 187136, "global_step/max_steps": "731/6362"} +{"lm loss": 5.28387213, "grad_norm": 1.36062813, "learning_rate": 9.888e-05, "elapsed_time_per_iteration": 6.61828017, "memory(GiB)": 21.51, "elapsed_time": "1h 20m 7s", "remaining_time": "10h 16m 13s", "loss_scale": 1.0, "consumed_samples": 187392, "global_step/max_steps": "732/6362"} +{"lm loss": 5.28116131, "grad_norm": 0.99971831, "learning_rate": 9.888e-05, "elapsed_time_per_iteration": 6.52168393, "memory(GiB)": 21.51, "elapsed_time": "1h 20m 13s", "remaining_time": "10h 16m 6s", "loss_scale": 1.0, "consumed_samples": 187648, "global_step/max_steps": "733/6362"} +{"lm loss": 5.29176283, "grad_norm": 1.2556082, "learning_rate": 9.887e-05, "elapsed_time_per_iteration": 6.36276746, "memory(GiB)": 21.51, "elapsed_time": "1h 20m 20s", "remaining_time": "10h 15m 58s", "loss_scale": 1.0, "consumed_samples": 187904, "global_step/max_steps": "734/6362"} +{"lm loss": 5.27380514, "grad_norm": 1.08075261, "learning_rate": 9.887e-05, "elapsed_time_per_iteration": 6.54233837, "memory(GiB)": 21.51, "elapsed_time": "1h 20m 26s", "remaining_time": "10h 15m 51s", "loss_scale": 1.0, "consumed_samples": 188160, "global_step/max_steps": "735/6362"} +{"lm loss": 5.30690718, "grad_norm": 1.30426574, "learning_rate": 9.886e-05, "elapsed_time_per_iteration": 6.41812348, "memory(GiB)": 21.51, "elapsed_time": "1h 20m 33s", "remaining_time": "10h 15m 43s", "loss_scale": 1.0, "consumed_samples": 188416, "global_step/max_steps": "736/6362"} +{"lm loss": 5.29624319, "grad_norm": 1.08699584, "learning_rate": 9.885e-05, "elapsed_time_per_iteration": 6.43785357, "memory(GiB)": 21.51, "elapsed_time": "1h 20m 39s", "remaining_time": "10h 15m 36s", "loss_scale": 1.0, "consumed_samples": 188672, "global_step/max_steps": "737/6362"} +{"lm loss": 5.29556608, "grad_norm": 1.22770417, "learning_rate": 9.885e-05, "elapsed_time_per_iteration": 6.3997004, "memory(GiB)": 21.51, "elapsed_time": "1h 20m 45s", "remaining_time": "10h 15m 28s", "loss_scale": 1.0, "consumed_samples": 188928, "global_step/max_steps": "738/6362"} +{"lm loss": 5.28903198, "grad_norm": 1.21821749, "learning_rate": 9.884e-05, "elapsed_time_per_iteration": 6.59283829, "memory(GiB)": 21.51, "elapsed_time": "1h 20m 52s", "remaining_time": "10h 15m 22s", "loss_scale": 1.0, "consumed_samples": 189184, "global_step/max_steps": "739/6362"} +{"lm loss": 5.28683567, "grad_norm": 1.25990713, "learning_rate": 9.884e-05, "elapsed_time_per_iteration": 6.59890771, "memory(GiB)": 21.51, "elapsed_time": "1h 20m 59s", "remaining_time": "10h 15m 15s", "loss_scale": 1.0, "consumed_samples": 189440, "global_step/max_steps": "740/6362"} +{"lm loss": 5.26903009, "grad_norm": 1.20018196, "learning_rate": 9.883e-05, "elapsed_time_per_iteration": 6.84546041, "memory(GiB)": 21.51, "elapsed_time": "1h 21m 5s", "remaining_time": "10h 15m 11s", "loss_scale": 1.0, "consumed_samples": 189696, "global_step/max_steps": "741/6362"} +{"lm loss": 5.28708887, "grad_norm": 1.03416646, "learning_rate": 9.883e-05, "elapsed_time_per_iteration": 6.5533576, "memory(GiB)": 21.51, "elapsed_time": "1h 21m 12s", "remaining_time": "10h 15m 4s", "loss_scale": 1.0, "consumed_samples": 189952, "global_step/max_steps": "742/6362"} +{"lm loss": 5.26464128, "grad_norm": 1.21981871, "learning_rate": 9.882e-05, "elapsed_time_per_iteration": 6.52073956, "memory(GiB)": 21.51, "elapsed_time": "1h 21m 18s", "remaining_time": "10h 14m 57s", "loss_scale": 1.0, "consumed_samples": 190208, "global_step/max_steps": "743/6362"} +{"lm loss": 5.28843164, "grad_norm": 1.15955675, "learning_rate": 9.882e-05, "elapsed_time_per_iteration": 6.64277387, "memory(GiB)": 21.51, "elapsed_time": "1h 21m 25s", "remaining_time": "10h 14m 51s", "loss_scale": 1.0, "consumed_samples": 190464, "global_step/max_steps": "744/6362"} +{"lm loss": 5.26786661, "grad_norm": 1.09102368, "learning_rate": 9.881e-05, "elapsed_time_per_iteration": 6.59114885, "memory(GiB)": 21.51, "elapsed_time": "1h 21m 32s", "remaining_time": "10h 14m 45s", "loss_scale": 1.0, "consumed_samples": 190720, "global_step/max_steps": "745/6362"} +{"lm loss": 5.28998995, "grad_norm": 1.21369505, "learning_rate": 9.881e-05, "elapsed_time_per_iteration": 6.63326907, "memory(GiB)": 21.51, "elapsed_time": "1h 21m 38s", "remaining_time": "10h 14m 39s", "loss_scale": 1.0, "consumed_samples": 190976, "global_step/max_steps": "746/6362"} +{"lm loss": 5.30309486, "grad_norm": 1.03537655, "learning_rate": 9.88e-05, "elapsed_time_per_iteration": 6.37602067, "memory(GiB)": 21.51, "elapsed_time": "1h 21m 45s", "remaining_time": "10h 14m 31s", "loss_scale": 1.0, "consumed_samples": 191232, "global_step/max_steps": "747/6362"} +{"lm loss": 5.27924442, "grad_norm": 1.19857419, "learning_rate": 9.879e-05, "elapsed_time_per_iteration": 6.51506686, "memory(GiB)": 21.51, "elapsed_time": "1h 21m 51s", "remaining_time": "10h 14m 24s", "loss_scale": 1.0, "consumed_samples": 191488, "global_step/max_steps": "748/6362"} +{"lm loss": 5.27990198, "grad_norm": 0.94415301, "learning_rate": 9.879e-05, "elapsed_time_per_iteration": 6.28569031, "memory(GiB)": 21.51, "elapsed_time": "1h 21m 58s", "remaining_time": "10h 14m 15s", "loss_scale": 1.0, "consumed_samples": 191744, "global_step/max_steps": "749/6362"} +{"lm loss": 5.27749729, "grad_norm": 1.2922287, "learning_rate": 9.878e-05, "elapsed_time_per_iteration": 6.36553192, "memory(GiB)": 21.51, "elapsed_time": "1h 22m 4s", "remaining_time": "10h 14m 7s", "loss_scale": 1.0, "consumed_samples": 192000, "global_step/max_steps": "750/6362"} +{"lm loss": 5.2652936, "grad_norm": 1.39142501, "learning_rate": 9.878e-05, "elapsed_time_per_iteration": 6.64486074, "memory(GiB)": 21.51, "elapsed_time": "1h 22m 11s", "remaining_time": "10h 14m 1s", "loss_scale": 1.0, "consumed_samples": 192256, "global_step/max_steps": "751/6362"} +{"lm loss": 5.25434875, "grad_norm": 0.99904346, "learning_rate": 9.877e-05, "elapsed_time_per_iteration": 6.54060197, "memory(GiB)": 21.51, "elapsed_time": "1h 22m 17s", "remaining_time": "10h 13m 54s", "loss_scale": 1.0, "consumed_samples": 192512, "global_step/max_steps": "752/6362"} +{"lm loss": 5.28278971, "grad_norm": 1.12340987, "learning_rate": 9.877e-05, "elapsed_time_per_iteration": 6.59247875, "memory(GiB)": 21.51, "elapsed_time": "1h 22m 24s", "remaining_time": "10h 13m 48s", "loss_scale": 1.0, "consumed_samples": 192768, "global_step/max_steps": "753/6362"} +{"lm loss": 5.27004385, "grad_norm": 1.09133112, "learning_rate": 9.876e-05, "elapsed_time_per_iteration": 6.5584259, "memory(GiB)": 21.51, "elapsed_time": "1h 22m 30s", "remaining_time": "10h 13m 41s", "loss_scale": 1.0, "consumed_samples": 193024, "global_step/max_steps": "754/6362"} +{"lm loss": 5.2897625, "grad_norm": 1.13040841, "learning_rate": 9.875e-05, "elapsed_time_per_iteration": 6.71501088, "memory(GiB)": 21.51, "elapsed_time": "1h 22m 37s", "remaining_time": "10h 13m 36s", "loss_scale": 1.0, "consumed_samples": 193280, "global_step/max_steps": "755/6362"} +{"lm loss": 5.26052856, "grad_norm": 1.24381375, "learning_rate": 9.875e-05, "elapsed_time_per_iteration": 6.64823604, "memory(GiB)": 21.51, "elapsed_time": "1h 22m 44s", "remaining_time": "10h 13m 30s", "loss_scale": 1.0, "consumed_samples": 193536, "global_step/max_steps": "756/6362"} +{"lm loss": 5.29635286, "grad_norm": 1.29654849, "learning_rate": 9.874e-05, "elapsed_time_per_iteration": 6.58768439, "memory(GiB)": 21.51, "elapsed_time": "1h 22m 50s", "remaining_time": "10h 13m 24s", "loss_scale": 1.0, "consumed_samples": 193792, "global_step/max_steps": "757/6362"} +{"lm loss": 5.25892115, "grad_norm": 1.13432372, "learning_rate": 9.874e-05, "elapsed_time_per_iteration": 6.61989141, "memory(GiB)": 21.51, "elapsed_time": "1h 22m 57s", "remaining_time": "10h 13m 17s", "loss_scale": 1.0, "consumed_samples": 194048, "global_step/max_steps": "758/6362"} +{"lm loss": 5.29268694, "grad_norm": 1.36903894, "learning_rate": 9.873e-05, "elapsed_time_per_iteration": 6.65363812, "memory(GiB)": 21.51, "elapsed_time": "1h 23m 3s", "remaining_time": "10h 13m 12s", "loss_scale": 1.0, "consumed_samples": 194304, "global_step/max_steps": "759/6362"} +{"lm loss": 5.27029467, "grad_norm": 1.16424227, "learning_rate": 9.873e-05, "elapsed_time_per_iteration": 6.55977368, "memory(GiB)": 21.51, "elapsed_time": "1h 23m 10s", "remaining_time": "10h 13m 5s", "loss_scale": 1.0, "consumed_samples": 194560, "global_step/max_steps": "760/6362"} +{"lm loss": 5.27655935, "grad_norm": 1.10225487, "learning_rate": 9.872e-05, "elapsed_time_per_iteration": 6.72305036, "memory(GiB)": 21.51, "elapsed_time": "1h 23m 17s", "remaining_time": "10h 12m 59s", "loss_scale": 1.0, "consumed_samples": 194816, "global_step/max_steps": "761/6362"} +{"lm loss": 5.30825377, "grad_norm": 1.15168536, "learning_rate": 9.871e-05, "elapsed_time_per_iteration": 6.7186563, "memory(GiB)": 21.51, "elapsed_time": "1h 23m 23s", "remaining_time": "10h 12m 54s", "loss_scale": 1.0, "consumed_samples": 195072, "global_step/max_steps": "762/6362"} +{"lm loss": 5.27874231, "grad_norm": 1.25751078, "learning_rate": 9.871e-05, "elapsed_time_per_iteration": 6.53765941, "memory(GiB)": 21.51, "elapsed_time": "1h 23m 30s", "remaining_time": "10h 12m 47s", "loss_scale": 1.0, "consumed_samples": 195328, "global_step/max_steps": "763/6362"} +{"lm loss": 5.26780319, "grad_norm": 1.10757101, "learning_rate": 9.87e-05, "elapsed_time_per_iteration": 6.45707345, "memory(GiB)": 21.51, "elapsed_time": "1h 23m 36s", "remaining_time": "10h 12m 40s", "loss_scale": 1.0, "consumed_samples": 195584, "global_step/max_steps": "764/6362"} +{"lm loss": 5.27837992, "grad_norm": 1.06280828, "learning_rate": 9.87e-05, "elapsed_time_per_iteration": 6.91025066, "memory(GiB)": 21.51, "elapsed_time": "1h 23m 43s", "remaining_time": "10h 12m 36s", "loss_scale": 1.0, "consumed_samples": 195840, "global_step/max_steps": "765/6362"} +{"lm loss": 5.28884315, "grad_norm": 1.13229966, "learning_rate": 9.869e-05, "elapsed_time_per_iteration": 6.4922905, "memory(GiB)": 21.51, "elapsed_time": "1h 23m 50s", "remaining_time": "10h 12m 29s", "loss_scale": 1.0, "consumed_samples": 196096, "global_step/max_steps": "766/6362"} +{"lm loss": 5.27012825, "grad_norm": 1.03487694, "learning_rate": 9.869e-05, "elapsed_time_per_iteration": 6.58289027, "memory(GiB)": 21.51, "elapsed_time": "1h 23m 56s", "remaining_time": "10h 12m 22s", "loss_scale": 1.0, "consumed_samples": 196352, "global_step/max_steps": "767/6362"} +{"lm loss": 5.25806093, "grad_norm": 1.1953032, "learning_rate": 9.868e-05, "elapsed_time_per_iteration": 6.49911141, "memory(GiB)": 21.51, "elapsed_time": "1h 24m 3s", "remaining_time": "10h 12m 15s", "loss_scale": 1.0, "consumed_samples": 196608, "global_step/max_steps": "768/6362"} +{"lm loss": 5.27197456, "grad_norm": 1.17005455, "learning_rate": 9.867e-05, "elapsed_time_per_iteration": 6.71602774, "memory(GiB)": 21.51, "elapsed_time": "1h 24m 10s", "remaining_time": "10h 12m 10s", "loss_scale": 1.0, "consumed_samples": 196864, "global_step/max_steps": "769/6362"} +{"lm loss": 5.24302197, "grad_norm": 1.22106469, "learning_rate": 9.867e-05, "elapsed_time_per_iteration": 6.36965227, "memory(GiB)": 21.51, "elapsed_time": "1h 24m 16s", "remaining_time": "10h 12m 2s", "loss_scale": 1.0, "consumed_samples": 197120, "global_step/max_steps": "770/6362"} +{"lm loss": 5.25578403, "grad_norm": 1.14226758, "learning_rate": 9.866e-05, "elapsed_time_per_iteration": 6.75525713, "memory(GiB)": 21.51, "elapsed_time": "1h 24m 23s", "remaining_time": "10h 11m 57s", "loss_scale": 1.0, "consumed_samples": 197376, "global_step/max_steps": "771/6362"} +{"lm loss": 5.27364635, "grad_norm": 1.27583647, "learning_rate": 9.866e-05, "elapsed_time_per_iteration": 6.42850041, "memory(GiB)": 21.51, "elapsed_time": "1h 24m 29s", "remaining_time": "10h 11m 49s", "loss_scale": 1.0, "consumed_samples": 197632, "global_step/max_steps": "772/6362"} +{"lm loss": 5.28273106, "grad_norm": 1.16538441, "learning_rate": 9.865e-05, "elapsed_time_per_iteration": 6.85565424, "memory(GiB)": 21.51, "elapsed_time": "1h 24m 36s", "remaining_time": "10h 11m 44s", "loss_scale": 1.0, "consumed_samples": 197888, "global_step/max_steps": "773/6362"} +{"lm loss": 5.27880764, "grad_norm": 1.3932538, "learning_rate": 9.864e-05, "elapsed_time_per_iteration": 6.31139922, "memory(GiB)": 21.51, "elapsed_time": "1h 24m 42s", "remaining_time": "10h 11m 36s", "loss_scale": 1.0, "consumed_samples": 198144, "global_step/max_steps": "774/6362"} +{"lm loss": 5.27618265, "grad_norm": 1.03634536, "learning_rate": 9.864e-05, "elapsed_time_per_iteration": 6.48772526, "memory(GiB)": 21.51, "elapsed_time": "1h 24m 49s", "remaining_time": "10h 11m 29s", "loss_scale": 1.0, "consumed_samples": 198400, "global_step/max_steps": "775/6362"} +{"lm loss": 5.27819347, "grad_norm": 1.27931464, "learning_rate": 9.863e-05, "elapsed_time_per_iteration": 6.66129303, "memory(GiB)": 21.51, "elapsed_time": "1h 24m 56s", "remaining_time": "10h 11m 23s", "loss_scale": 1.0, "consumed_samples": 198656, "global_step/max_steps": "776/6362"} +{"lm loss": 5.26868248, "grad_norm": 1.0890559, "learning_rate": 9.863e-05, "elapsed_time_per_iteration": 6.58278513, "memory(GiB)": 21.51, "elapsed_time": "1h 25m 2s", "remaining_time": "10h 11m 17s", "loss_scale": 1.0, "consumed_samples": 198912, "global_step/max_steps": "777/6362"} +{"lm loss": 5.25291777, "grad_norm": 1.35963404, "learning_rate": 9.862e-05, "elapsed_time_per_iteration": 6.62872744, "memory(GiB)": 21.51, "elapsed_time": "1h 25m 9s", "remaining_time": "10h 11m 10s", "loss_scale": 1.0, "consumed_samples": 199168, "global_step/max_steps": "778/6362"} +{"lm loss": 5.25494576, "grad_norm": 0.93259299, "learning_rate": 9.861e-05, "elapsed_time_per_iteration": 6.4655993, "memory(GiB)": 21.51, "elapsed_time": "1h 25m 15s", "remaining_time": "10h 11m 3s", "loss_scale": 1.0, "consumed_samples": 199424, "global_step/max_steps": "779/6362"} +{"lm loss": 5.27253866, "grad_norm": 1.25337934, "learning_rate": 9.861e-05, "elapsed_time_per_iteration": 6.51545811, "memory(GiB)": 21.51, "elapsed_time": "1h 25m 22s", "remaining_time": "10h 10m 56s", "loss_scale": 1.0, "consumed_samples": 199680, "global_step/max_steps": "780/6362"} +{"lm loss": 5.2818799, "grad_norm": 1.402596, "learning_rate": 9.86e-05, "elapsed_time_per_iteration": 6.40284252, "memory(GiB)": 21.51, "elapsed_time": "1h 25m 28s", "remaining_time": "10h 10m 48s", "loss_scale": 1.0, "consumed_samples": 199936, "global_step/max_steps": "781/6362"} +{"lm loss": 5.24875212, "grad_norm": 1.0696981, "learning_rate": 9.86e-05, "elapsed_time_per_iteration": 6.48335528, "memory(GiB)": 21.51, "elapsed_time": "1h 25m 35s", "remaining_time": "10h 10m 41s", "loss_scale": 1.0, "consumed_samples": 200192, "global_step/max_steps": "782/6362"} +{"lm loss": 5.25591898, "grad_norm": 1.15524673, "learning_rate": 9.859e-05, "elapsed_time_per_iteration": 6.54619884, "memory(GiB)": 21.51, "elapsed_time": "1h 25m 41s", "remaining_time": "10h 10m 35s", "loss_scale": 1.0, "consumed_samples": 200448, "global_step/max_steps": "783/6362"} +{"lm loss": 5.27460957, "grad_norm": 0.96052772, "learning_rate": 9.858e-05, "elapsed_time_per_iteration": 6.49622011, "memory(GiB)": 21.51, "elapsed_time": "1h 25m 48s", "remaining_time": "10h 10m 28s", "loss_scale": 1.0, "consumed_samples": 200704, "global_step/max_steps": "784/6362"} +{"lm loss": 5.25323391, "grad_norm": 1.18959737, "learning_rate": 9.858e-05, "elapsed_time_per_iteration": 6.78749275, "memory(GiB)": 21.51, "elapsed_time": "1h 25m 54s", "remaining_time": "10h 10m 23s", "loss_scale": 1.0, "consumed_samples": 200960, "global_step/max_steps": "785/6362"} +{"lm loss": 5.26856947, "grad_norm": 1.11759448, "learning_rate": 9.857e-05, "elapsed_time_per_iteration": 6.65505886, "memory(GiB)": 21.51, "elapsed_time": "1h 26m 1s", "remaining_time": "10h 10m 17s", "loss_scale": 1.0, "consumed_samples": 201216, "global_step/max_steps": "786/6362"} +{"lm loss": 5.25132895, "grad_norm": 1.1477648, "learning_rate": 9.857e-05, "elapsed_time_per_iteration": 6.47813344, "memory(GiB)": 21.51, "elapsed_time": "1h 26m 8s", "remaining_time": "10h 10m 9s", "loss_scale": 1.0, "consumed_samples": 201472, "global_step/max_steps": "787/6362"} +{"lm loss": 5.25149679, "grad_norm": 1.11865079, "learning_rate": 9.856e-05, "elapsed_time_per_iteration": 6.38186622, "memory(GiB)": 21.51, "elapsed_time": "1h 26m 14s", "remaining_time": "10h 10m 2s", "loss_scale": 1.0, "consumed_samples": 201728, "global_step/max_steps": "788/6362"} +{"lm loss": 5.27509069, "grad_norm": 1.36047006, "learning_rate": 9.855e-05, "elapsed_time_per_iteration": 6.45829916, "memory(GiB)": 21.51, "elapsed_time": "1h 26m 20s", "remaining_time": "10h 9m 54s", "loss_scale": 1.0, "consumed_samples": 201984, "global_step/max_steps": "789/6362"} +{"lm loss": 5.26898193, "grad_norm": 1.08937919, "learning_rate": 9.855e-05, "elapsed_time_per_iteration": 6.51655626, "memory(GiB)": 21.51, "elapsed_time": "1h 26m 27s", "remaining_time": "10h 9m 47s", "loss_scale": 1.0, "consumed_samples": 202240, "global_step/max_steps": "790/6362"} +{"lm loss": 5.27187538, "grad_norm": 1.16920841, "learning_rate": 9.854e-05, "elapsed_time_per_iteration": 6.70278049, "memory(GiB)": 21.51, "elapsed_time": "1h 26m 34s", "remaining_time": "10h 9m 42s", "loss_scale": 1.0, "consumed_samples": 202496, "global_step/max_steps": "791/6362"} +{"lm loss": 5.24033213, "grad_norm": 0.97873688, "learning_rate": 9.854e-05, "elapsed_time_per_iteration": 6.70795035, "memory(GiB)": 21.51, "elapsed_time": "1h 26m 40s", "remaining_time": "10h 9m 36s", "loss_scale": 1.0, "consumed_samples": 202752, "global_step/max_steps": "792/6362"} +{"lm loss": 5.25921917, "grad_norm": 1.41990006, "learning_rate": 9.853e-05, "elapsed_time_per_iteration": 6.78733635, "memory(GiB)": 21.51, "elapsed_time": "1h 26m 47s", "remaining_time": "10h 9m 31s", "loss_scale": 1.0, "consumed_samples": 203008, "global_step/max_steps": "793/6362"} +{"lm loss": 5.26479959, "grad_norm": 0.89669347, "learning_rate": 9.852e-05, "elapsed_time_per_iteration": 6.55136323, "memory(GiB)": 21.51, "elapsed_time": "1h 26m 54s", "remaining_time": "10h 9m 24s", "loss_scale": 1.0, "consumed_samples": 203264, "global_step/max_steps": "794/6362"} +{"lm loss": 5.2677722, "grad_norm": 0.88337797, "learning_rate": 9.852e-05, "elapsed_time_per_iteration": 6.4498229, "memory(GiB)": 21.51, "elapsed_time": "1h 27m 0s", "remaining_time": "10h 9m 17s", "loss_scale": 1.0, "consumed_samples": 203520, "global_step/max_steps": "795/6362"} +{"lm loss": 5.25697708, "grad_norm": 1.01119244, "learning_rate": 9.851e-05, "elapsed_time_per_iteration": 6.54435515, "memory(GiB)": 21.51, "elapsed_time": "1h 27m 7s", "remaining_time": "10h 9m 10s", "loss_scale": 1.0, "consumed_samples": 203776, "global_step/max_steps": "796/6362"} +{"lm loss": 5.27370262, "grad_norm": 1.38818157, "learning_rate": 9.851e-05, "elapsed_time_per_iteration": 6.54370737, "memory(GiB)": 21.51, "elapsed_time": "1h 27m 13s", "remaining_time": "10h 9m 4s", "loss_scale": 1.0, "consumed_samples": 204032, "global_step/max_steps": "797/6362"} +{"lm loss": 5.25856066, "grad_norm": 0.9453302, "learning_rate": 9.85e-05, "elapsed_time_per_iteration": 6.42438841, "memory(GiB)": 21.51, "elapsed_time": "1h 27m 20s", "remaining_time": "10h 8m 56s", "loss_scale": 1.0, "consumed_samples": 204288, "global_step/max_steps": "798/6362"} +{"lm loss": 5.24723959, "grad_norm": 1.33826482, "learning_rate": 9.849e-05, "elapsed_time_per_iteration": 6.81608391, "memory(GiB)": 21.51, "elapsed_time": "1h 27m 26s", "remaining_time": "10h 8m 51s", "loss_scale": 1.0, "consumed_samples": 204544, "global_step/max_steps": "799/6362"} +{"lm loss": 5.23716497, "grad_norm": 0.98899674, "learning_rate": 9.849e-05, "elapsed_time_per_iteration": 6.53161263, "memory(GiB)": 21.51, "elapsed_time": "1h 27m 33s", "remaining_time": "10h 8m 44s", "loss_scale": 1.0, "consumed_samples": 204800, "global_step/max_steps": "800/6362"} +{"lm loss": 5.25625515, "grad_norm": 1.07683682, "learning_rate": 9.848e-05, "elapsed_time_per_iteration": 6.53891277, "memory(GiB)": 21.51, "elapsed_time": "1h 27m 40s", "remaining_time": "10h 8m 38s", "loss_scale": 1.0, "consumed_samples": 205056, "global_step/max_steps": "801/6362"} +{"lm loss": 5.27263927, "grad_norm": 1.31387424, "learning_rate": 9.847e-05, "elapsed_time_per_iteration": 6.83124995, "memory(GiB)": 21.51, "elapsed_time": "1h 27m 46s", "remaining_time": "10h 8m 33s", "loss_scale": 1.0, "consumed_samples": 205312, "global_step/max_steps": "802/6362"} +{"lm loss": 5.2591548, "grad_norm": 1.03855562, "learning_rate": 9.847e-05, "elapsed_time_per_iteration": 6.67765617, "memory(GiB)": 21.51, "elapsed_time": "1h 27m 53s", "remaining_time": "10h 8m 27s", "loss_scale": 1.0, "consumed_samples": 205568, "global_step/max_steps": "803/6362"} +{"lm loss": 5.26542521, "grad_norm": 1.22720253, "learning_rate": 9.846e-05, "elapsed_time_per_iteration": 6.64224839, "memory(GiB)": 21.51, "elapsed_time": "1h 28m 0s", "remaining_time": "10h 8m 21s", "loss_scale": 1.0, "consumed_samples": 205824, "global_step/max_steps": "804/6362"} +{"lm loss": 5.2413702, "grad_norm": 1.06299996, "learning_rate": 9.845e-05, "elapsed_time_per_iteration": 6.69578624, "memory(GiB)": 21.51, "elapsed_time": "1h 28m 6s", "remaining_time": "10h 8m 15s", "loss_scale": 1.0, "consumed_samples": 206080, "global_step/max_steps": "805/6362"} +{"lm loss": 5.27493, "grad_norm": 1.27736568, "learning_rate": 9.845e-05, "elapsed_time_per_iteration": 6.52358031, "memory(GiB)": 21.51, "elapsed_time": "1h 28m 13s", "remaining_time": "10h 8m 8s", "loss_scale": 1.0, "consumed_samples": 206336, "global_step/max_steps": "806/6362"} +{"lm loss": 5.26468182, "grad_norm": 0.90851116, "learning_rate": 9.844e-05, "elapsed_time_per_iteration": 6.63856792, "memory(GiB)": 21.51, "elapsed_time": "1h 28m 20s", "remaining_time": "10h 8m 2s", "loss_scale": 1.0, "consumed_samples": 206592, "global_step/max_steps": "807/6362"} +{"lm loss": 5.26228237, "grad_norm": 0.86216718, "learning_rate": 9.844e-05, "elapsed_time_per_iteration": 6.38298392, "memory(GiB)": 21.51, "elapsed_time": "1h 28m 26s", "remaining_time": "10h 7m 55s", "loss_scale": 1.0, "consumed_samples": 206848, "global_step/max_steps": "808/6362"} +{"lm loss": 5.24278116, "grad_norm": 1.21125901, "learning_rate": 9.843e-05, "elapsed_time_per_iteration": 6.48034096, "memory(GiB)": 21.51, "elapsed_time": "1h 28m 32s", "remaining_time": "10h 7m 47s", "loss_scale": 1.0, "consumed_samples": 207104, "global_step/max_steps": "809/6362"} +{"lm loss": 5.24542141, "grad_norm": 1.29512846, "learning_rate": 9.842e-05, "elapsed_time_per_iteration": 6.30108643, "memory(GiB)": 21.51, "elapsed_time": "1h 28m 39s", "remaining_time": "10h 7m 39s", "loss_scale": 1.0, "consumed_samples": 207360, "global_step/max_steps": "810/6362"} +{"lm loss": 5.26405907, "grad_norm": 1.23714447, "learning_rate": 9.842e-05, "elapsed_time_per_iteration": 6.54919076, "memory(GiB)": 21.51, "elapsed_time": "1h 28m 45s", "remaining_time": "10h 7m 32s", "loss_scale": 1.0, "consumed_samples": 207616, "global_step/max_steps": "811/6362"} +{"lm loss": 5.24667835, "grad_norm": 1.29436302, "learning_rate": 9.841e-05, "elapsed_time_per_iteration": 6.38129973, "memory(GiB)": 21.51, "elapsed_time": "1h 28m 52s", "remaining_time": "10h 7m 24s", "loss_scale": 1.0, "consumed_samples": 207872, "global_step/max_steps": "812/6362"} +{"lm loss": 5.233325, "grad_norm": 0.97593975, "learning_rate": 9.84e-05, "elapsed_time_per_iteration": 6.47095847, "memory(GiB)": 21.51, "elapsed_time": "1h 28m 58s", "remaining_time": "10h 7m 17s", "loss_scale": 1.0, "consumed_samples": 208128, "global_step/max_steps": "813/6362"} +{"lm loss": 5.27439451, "grad_norm": 1.18507922, "learning_rate": 9.84e-05, "elapsed_time_per_iteration": 6.43822813, "memory(GiB)": 21.51, "elapsed_time": "1h 29m 5s", "remaining_time": "10h 7m 10s", "loss_scale": 1.0, "consumed_samples": 208384, "global_step/max_steps": "814/6362"} +{"lm loss": 5.25311184, "grad_norm": 1.07238603, "learning_rate": 9.839e-05, "elapsed_time_per_iteration": 6.62232327, "memory(GiB)": 21.51, "elapsed_time": "1h 29m 11s", "remaining_time": "10h 7m 4s", "loss_scale": 1.0, "consumed_samples": 208640, "global_step/max_steps": "815/6362"} +{"lm loss": 5.24151087, "grad_norm": 1.28298545, "learning_rate": 9.838e-05, "elapsed_time_per_iteration": 6.50145483, "memory(GiB)": 21.51, "elapsed_time": "1h 29m 18s", "remaining_time": "10h 6m 57s", "loss_scale": 1.0, "consumed_samples": 208896, "global_step/max_steps": "816/6362"} +{"lm loss": 5.25191355, "grad_norm": 1.10858846, "learning_rate": 9.838e-05, "elapsed_time_per_iteration": 6.60395432, "memory(GiB)": 21.51, "elapsed_time": "1h 29m 24s", "remaining_time": "10h 6m 50s", "loss_scale": 1.0, "consumed_samples": 209152, "global_step/max_steps": "817/6362"} +{"lm loss": 5.26504993, "grad_norm": 1.14396906, "learning_rate": 9.837e-05, "elapsed_time_per_iteration": 6.74397659, "memory(GiB)": 21.51, "elapsed_time": "1h 29m 31s", "remaining_time": "10h 6m 45s", "loss_scale": 1.0, "consumed_samples": 209408, "global_step/max_steps": "818/6362"} +{"lm loss": 5.2634201, "grad_norm": 1.10589814, "learning_rate": 9.837e-05, "elapsed_time_per_iteration": 6.86129975, "memory(GiB)": 21.51, "elapsed_time": "1h 29m 38s", "remaining_time": "10h 6m 40s", "loss_scale": 1.0, "consumed_samples": 209664, "global_step/max_steps": "819/6362"} +{"lm loss": 5.25747347, "grad_norm": 1.15471196, "learning_rate": 9.836e-05, "elapsed_time_per_iteration": 6.57308054, "memory(GiB)": 21.51, "elapsed_time": "1h 29m 44s", "remaining_time": "10h 6m 34s", "loss_scale": 1.0, "consumed_samples": 209920, "global_step/max_steps": "820/6362"} +{"lm loss": 5.27969027, "grad_norm": 1.16556418, "learning_rate": 9.835e-05, "elapsed_time_per_iteration": 6.59933424, "memory(GiB)": 21.51, "elapsed_time": "1h 29m 51s", "remaining_time": "10h 6m 27s", "loss_scale": 1.0, "consumed_samples": 210176, "global_step/max_steps": "821/6362"} +{"lm loss": 5.25032425, "grad_norm": 1.29842317, "learning_rate": 9.835e-05, "elapsed_time_per_iteration": 6.54608798, "memory(GiB)": 21.51, "elapsed_time": "1h 29m 58s", "remaining_time": "10h 6m 21s", "loss_scale": 1.0, "consumed_samples": 210432, "global_step/max_steps": "822/6362"} +{"lm loss": 5.24967003, "grad_norm": 0.88257557, "learning_rate": 9.834e-05, "elapsed_time_per_iteration": 6.51921582, "memory(GiB)": 21.51, "elapsed_time": "1h 30m 4s", "remaining_time": "10h 6m 14s", "loss_scale": 1.0, "consumed_samples": 210688, "global_step/max_steps": "823/6362"} +{"lm loss": 5.2457962, "grad_norm": 0.87703979, "learning_rate": 9.833e-05, "elapsed_time_per_iteration": 6.45169687, "memory(GiB)": 21.51, "elapsed_time": "1h 30m 11s", "remaining_time": "10h 6m 7s", "loss_scale": 1.0, "consumed_samples": 210944, "global_step/max_steps": "824/6362"} +{"lm loss": 5.26406336, "grad_norm": 1.09472847, "learning_rate": 9.833e-05, "elapsed_time_per_iteration": 6.42370772, "memory(GiB)": 21.51, "elapsed_time": "1h 30m 17s", "remaining_time": "10h 5m 59s", "loss_scale": 1.0, "consumed_samples": 211200, "global_step/max_steps": "825/6362"} +{"lm loss": 5.25095987, "grad_norm": 1.17776966, "learning_rate": 9.832e-05, "elapsed_time_per_iteration": 6.60680652, "memory(GiB)": 21.51, "elapsed_time": "1h 30m 24s", "remaining_time": "10h 5m 53s", "loss_scale": 1.0, "consumed_samples": 211456, "global_step/max_steps": "826/6362"} +{"lm loss": 5.26047754, "grad_norm": 1.10309553, "learning_rate": 9.831e-05, "elapsed_time_per_iteration": 6.79007196, "memory(GiB)": 21.51, "elapsed_time": "1h 30m 30s", "remaining_time": "10h 5m 48s", "loss_scale": 1.0, "consumed_samples": 211712, "global_step/max_steps": "827/6362"} +{"lm loss": 5.23774719, "grad_norm": 1.41383195, "learning_rate": 9.831e-05, "elapsed_time_per_iteration": 6.58248019, "memory(GiB)": 21.51, "elapsed_time": "1h 30m 37s", "remaining_time": "10h 5m 41s", "loss_scale": 1.0, "consumed_samples": 211968, "global_step/max_steps": "828/6362"} +{"lm loss": 5.26655293, "grad_norm": 1.09495068, "learning_rate": 9.83e-05, "elapsed_time_per_iteration": 6.43996358, "memory(GiB)": 21.51, "elapsed_time": "1h 30m 43s", "remaining_time": "10h 5m 34s", "loss_scale": 1.0, "consumed_samples": 212224, "global_step/max_steps": "829/6362"} +{"lm loss": 5.25311804, "grad_norm": 1.13144457, "learning_rate": 9.829e-05, "elapsed_time_per_iteration": 6.53531599, "memory(GiB)": 21.51, "elapsed_time": "1h 30m 50s", "remaining_time": "10h 5m 27s", "loss_scale": 1.0, "consumed_samples": 212480, "global_step/max_steps": "830/6362"} +{"lm loss": 5.24820662, "grad_norm": 1.04650044, "learning_rate": 9.829e-05, "elapsed_time_per_iteration": 6.55963397, "memory(GiB)": 21.51, "elapsed_time": "1h 30m 56s", "remaining_time": "10h 5m 20s", "loss_scale": 1.0, "consumed_samples": 212736, "global_step/max_steps": "831/6362"} +{"lm loss": 5.2515707, "grad_norm": 1.29138911, "learning_rate": 9.828e-05, "elapsed_time_per_iteration": 6.56174159, "memory(GiB)": 21.51, "elapsed_time": "1h 31m 3s", "remaining_time": "10h 5m 14s", "loss_scale": 1.0, "consumed_samples": 212992, "global_step/max_steps": "832/6362"} +{"lm loss": 5.23446035, "grad_norm": 1.07638013, "learning_rate": 9.827e-05, "elapsed_time_per_iteration": 6.60129094, "memory(GiB)": 21.51, "elapsed_time": "1h 31m 10s", "remaining_time": "10h 5m 7s", "loss_scale": 1.0, "consumed_samples": 213248, "global_step/max_steps": "833/6362"} +{"lm loss": 5.23441553, "grad_norm": 1.08631647, "learning_rate": 9.827e-05, "elapsed_time_per_iteration": 6.60847139, "memory(GiB)": 21.51, "elapsed_time": "1h 31m 16s", "remaining_time": "10h 5m 1s", "loss_scale": 1.0, "consumed_samples": 213504, "global_step/max_steps": "834/6362"} +{"lm loss": 5.2405138, "grad_norm": 1.07370996, "learning_rate": 9.826e-05, "elapsed_time_per_iteration": 6.51023626, "memory(GiB)": 21.51, "elapsed_time": "1h 31m 23s", "remaining_time": "10h 4m 54s", "loss_scale": 1.0, "consumed_samples": 213760, "global_step/max_steps": "835/6362"} +{"lm loss": 5.25486231, "grad_norm": 1.1540277, "learning_rate": 9.825e-05, "elapsed_time_per_iteration": 6.60295129, "memory(GiB)": 21.51, "elapsed_time": "1h 31m 29s", "remaining_time": "10h 4m 48s", "loss_scale": 1.0, "consumed_samples": 214016, "global_step/max_steps": "836/6362"} +{"lm loss": 5.24589634, "grad_norm": 1.21021414, "learning_rate": 9.825e-05, "elapsed_time_per_iteration": 6.52820039, "memory(GiB)": 21.51, "elapsed_time": "1h 31m 36s", "remaining_time": "10h 4m 41s", "loss_scale": 1.0, "consumed_samples": 214272, "global_step/max_steps": "837/6362"} +{"lm loss": 5.26136065, "grad_norm": 1.08611131, "learning_rate": 9.824e-05, "elapsed_time_per_iteration": 6.95781136, "memory(GiB)": 21.51, "elapsed_time": "1h 31m 43s", "remaining_time": "10h 4m 37s", "loss_scale": 1.0, "consumed_samples": 214528, "global_step/max_steps": "838/6362"} +{"lm loss": 5.25156355, "grad_norm": 1.16597676, "learning_rate": 9.823e-05, "elapsed_time_per_iteration": 6.74119377, "memory(GiB)": 21.51, "elapsed_time": "1h 31m 50s", "remaining_time": "10h 4m 32s", "loss_scale": 1.0, "consumed_samples": 214784, "global_step/max_steps": "839/6362"} +{"lm loss": 5.24988174, "grad_norm": 1.08281946, "learning_rate": 9.823e-05, "elapsed_time_per_iteration": 6.62671781, "memory(GiB)": 21.51, "elapsed_time": "1h 31m 56s", "remaining_time": "10h 4m 25s", "loss_scale": 1.0, "consumed_samples": 215040, "global_step/max_steps": "840/6362"} +{"lm loss": 5.25538635, "grad_norm": 1.13872242, "learning_rate": 9.822e-05, "elapsed_time_per_iteration": 6.49927592, "memory(GiB)": 21.51, "elapsed_time": "1h 32m 3s", "remaining_time": "10h 4m 18s", "loss_scale": 1.0, "consumed_samples": 215296, "global_step/max_steps": "841/6362"} +{"lm loss": 5.24103689, "grad_norm": 1.09029078, "learning_rate": 9.821e-05, "elapsed_time_per_iteration": 6.84332752, "memory(GiB)": 21.51, "elapsed_time": "1h 32m 10s", "remaining_time": "10h 4m 14s", "loss_scale": 1.0, "consumed_samples": 215552, "global_step/max_steps": "842/6362"} +{"lm loss": 5.23786354, "grad_norm": 1.15398121, "learning_rate": 9.821e-05, "elapsed_time_per_iteration": 6.57739663, "memory(GiB)": 21.51, "elapsed_time": "1h 32m 16s", "remaining_time": "10h 4m 7s", "loss_scale": 1.0, "consumed_samples": 215808, "global_step/max_steps": "843/6362"} +{"lm loss": 5.24564981, "grad_norm": 1.03964186, "learning_rate": 9.82e-05, "elapsed_time_per_iteration": 6.92760277, "memory(GiB)": 21.51, "elapsed_time": "1h 32m 23s", "remaining_time": "10h 4m 3s", "loss_scale": 1.0, "consumed_samples": 216064, "global_step/max_steps": "844/6362"} +{"lm loss": 5.26018667, "grad_norm": 1.25522828, "learning_rate": 9.819e-05, "elapsed_time_per_iteration": 6.52329493, "memory(GiB)": 21.51, "elapsed_time": "1h 32m 30s", "remaining_time": "10h 3m 56s", "loss_scale": 1.0, "consumed_samples": 216320, "global_step/max_steps": "845/6362"} +{"lm loss": 5.24383211, "grad_norm": 1.1752373, "learning_rate": 9.819e-05, "elapsed_time_per_iteration": 6.79768991, "memory(GiB)": 21.51, "elapsed_time": "1h 32m 36s", "remaining_time": "10h 3m 51s", "loss_scale": 1.0, "consumed_samples": 216576, "global_step/max_steps": "846/6362"} +{"lm loss": 5.22740555, "grad_norm": 1.09325349, "learning_rate": 9.818e-05, "elapsed_time_per_iteration": 6.57673478, "memory(GiB)": 21.51, "elapsed_time": "1h 32m 43s", "remaining_time": "10h 3m 45s", "loss_scale": 1.0, "consumed_samples": 216832, "global_step/max_steps": "847/6362"} +{"lm loss": 5.24478817, "grad_norm": 1.10068512, "learning_rate": 9.817e-05, "elapsed_time_per_iteration": 6.75965118, "memory(GiB)": 21.51, "elapsed_time": "1h 32m 50s", "remaining_time": "10h 3m 39s", "loss_scale": 1.0, "consumed_samples": 217088, "global_step/max_steps": "848/6362"} +{"lm loss": 5.25055313, "grad_norm": 1.01307249, "learning_rate": 9.816e-05, "elapsed_time_per_iteration": 6.62953734, "memory(GiB)": 21.51, "elapsed_time": "1h 32m 56s", "remaining_time": "10h 3m 33s", "loss_scale": 1.0, "consumed_samples": 217344, "global_step/max_steps": "849/6362"} +{"lm loss": 5.24909163, "grad_norm": 1.21907377, "learning_rate": 9.816e-05, "elapsed_time_per_iteration": 6.54285669, "memory(GiB)": 21.51, "elapsed_time": "1h 33m 3s", "remaining_time": "10h 3m 26s", "loss_scale": 1.0, "consumed_samples": 217600, "global_step/max_steps": "850/6362"} +{"lm loss": 5.25807428, "grad_norm": 1.05540299, "learning_rate": 9.815e-05, "elapsed_time_per_iteration": 6.6528275, "memory(GiB)": 21.51, "elapsed_time": "1h 33m 10s", "remaining_time": "10h 3m 20s", "loss_scale": 1.0, "consumed_samples": 217856, "global_step/max_steps": "851/6362"} +{"lm loss": 5.24465752, "grad_norm": 1.12236345, "learning_rate": 9.814e-05, "elapsed_time_per_iteration": 6.46667409, "memory(GiB)": 21.51, "elapsed_time": "1h 33m 16s", "remaining_time": "10h 3m 13s", "loss_scale": 1.0, "consumed_samples": 218112, "global_step/max_steps": "852/6362"} +{"lm loss": 5.24154282, "grad_norm": 1.06670475, "learning_rate": 9.814e-05, "elapsed_time_per_iteration": 6.36043048, "memory(GiB)": 21.51, "elapsed_time": "1h 33m 22s", "remaining_time": "10h 3m 5s", "loss_scale": 1.0, "consumed_samples": 218368, "global_step/max_steps": "853/6362"} +{"lm loss": 5.24097729, "grad_norm": 1.03324604, "learning_rate": 9.813e-05, "elapsed_time_per_iteration": 6.66955018, "memory(GiB)": 21.51, "elapsed_time": "1h 33m 29s", "remaining_time": "10h 2m 59s", "loss_scale": 1.0, "consumed_samples": 218624, "global_step/max_steps": "854/6362"} +{"lm loss": 5.24015045, "grad_norm": 1.28184402, "learning_rate": 9.812e-05, "elapsed_time_per_iteration": 6.38153148, "memory(GiB)": 21.51, "elapsed_time": "1h 33m 35s", "remaining_time": "10h 2m 51s", "loss_scale": 1.0, "consumed_samples": 218880, "global_step/max_steps": "855/6362"} +{"lm loss": 5.23305178, "grad_norm": 0.97618926, "learning_rate": 9.812e-05, "elapsed_time_per_iteration": 6.51729488, "memory(GiB)": 21.51, "elapsed_time": "1h 33m 42s", "remaining_time": "10h 2m 45s", "loss_scale": 1.0, "consumed_samples": 219136, "global_step/max_steps": "856/6362"} +{"lm loss": 5.23184443, "grad_norm": 1.20659983, "learning_rate": 9.811e-05, "elapsed_time_per_iteration": 6.53385139, "memory(GiB)": 21.51, "elapsed_time": "1h 33m 48s", "remaining_time": "10h 2m 38s", "loss_scale": 1.0, "consumed_samples": 219392, "global_step/max_steps": "857/6362"} +{"lm loss": 5.22996616, "grad_norm": 0.96163422, "learning_rate": 9.81e-05, "elapsed_time_per_iteration": 6.43494368, "memory(GiB)": 21.51, "elapsed_time": "1h 33m 55s", "remaining_time": "10h 2m 30s", "loss_scale": 1.0, "consumed_samples": 219648, "global_step/max_steps": "858/6362"} +{"lm loss": 5.22523165, "grad_norm": 1.10311747, "learning_rate": 9.81e-05, "elapsed_time_per_iteration": 6.67782331, "memory(GiB)": 21.51, "elapsed_time": "1h 34m 2s", "remaining_time": "10h 2m 24s", "loss_scale": 1.0, "consumed_samples": 219904, "global_step/max_steps": "859/6362"} +{"lm loss": 5.25619125, "grad_norm": 1.22784209, "learning_rate": 9.809e-05, "elapsed_time_per_iteration": 6.54409814, "memory(GiB)": 21.51, "elapsed_time": "1h 34m 8s", "remaining_time": "10h 2m 18s", "loss_scale": 1.0, "consumed_samples": 220160, "global_step/max_steps": "860/6362"} +{"lm loss": 5.25298738, "grad_norm": 1.03548038, "learning_rate": 9.808e-05, "elapsed_time_per_iteration": 6.54418111, "memory(GiB)": 21.51, "elapsed_time": "1h 34m 15s", "remaining_time": "10h 2m 11s", "loss_scale": 1.0, "consumed_samples": 220416, "global_step/max_steps": "861/6362"} +{"lm loss": 5.23088074, "grad_norm": 1.14282215, "learning_rate": 9.807e-05, "elapsed_time_per_iteration": 6.55451679, "memory(GiB)": 21.51, "elapsed_time": "1h 34m 21s", "remaining_time": "10h 2m 4s", "loss_scale": 1.0, "consumed_samples": 220672, "global_step/max_steps": "862/6362"} +{"lm loss": 5.24091482, "grad_norm": 1.0122757, "learning_rate": 9.807e-05, "elapsed_time_per_iteration": 6.44732451, "memory(GiB)": 21.51, "elapsed_time": "1h 34m 28s", "remaining_time": "10h 1m 57s", "loss_scale": 1.0, "consumed_samples": 220928, "global_step/max_steps": "863/6362"} +{"lm loss": 5.22955132, "grad_norm": 1.04977763, "learning_rate": 9.806e-05, "elapsed_time_per_iteration": 6.46375942, "memory(GiB)": 21.51, "elapsed_time": "1h 34m 34s", "remaining_time": "10h 1m 50s", "loss_scale": 1.0, "consumed_samples": 221184, "global_step/max_steps": "864/6362"} +{"lm loss": 5.22555161, "grad_norm": 0.99910831, "learning_rate": 9.805e-05, "elapsed_time_per_iteration": 6.61847806, "memory(GiB)": 21.51, "elapsed_time": "1h 34m 41s", "remaining_time": "10h 1m 44s", "loss_scale": 1.0, "consumed_samples": 221440, "global_step/max_steps": "865/6362"} +{"lm loss": 5.24531507, "grad_norm": 1.08723104, "learning_rate": 9.805e-05, "elapsed_time_per_iteration": 6.55290127, "memory(GiB)": 21.51, "elapsed_time": "1h 34m 47s", "remaining_time": "10h 1m 37s", "loss_scale": 1.0, "consumed_samples": 221696, "global_step/max_steps": "866/6362"} +{"lm loss": 5.21838331, "grad_norm": 1.05509245, "learning_rate": 9.804e-05, "elapsed_time_per_iteration": 6.59598064, "memory(GiB)": 21.51, "elapsed_time": "1h 34m 54s", "remaining_time": "10h 1m 31s", "loss_scale": 1.0, "consumed_samples": 221952, "global_step/max_steps": "867/6362"} +{"lm loss": 5.23929644, "grad_norm": 1.23421192, "learning_rate": 9.803e-05, "elapsed_time_per_iteration": 7.15815568, "memory(GiB)": 21.51, "elapsed_time": "1h 35m 1s", "remaining_time": "10h 1m 28s", "loss_scale": 1.0, "consumed_samples": 222208, "global_step/max_steps": "868/6362"} +{"lm loss": 5.23468781, "grad_norm": 0.91955847, "learning_rate": 9.803e-05, "elapsed_time_per_iteration": 6.72798562, "memory(GiB)": 21.51, "elapsed_time": "1h 35m 8s", "remaining_time": "10h 1m 22s", "loss_scale": 1.0, "consumed_samples": 222464, "global_step/max_steps": "869/6362"} +{"lm loss": 5.23050213, "grad_norm": 1.19196713, "learning_rate": 9.802e-05, "elapsed_time_per_iteration": 6.74232817, "memory(GiB)": 21.51, "elapsed_time": "1h 35m 15s", "remaining_time": "10h 1m 17s", "loss_scale": 1.0, "consumed_samples": 222720, "global_step/max_steps": "870/6362"} +{"lm loss": 5.24107313, "grad_norm": 1.25604308, "learning_rate": 9.801e-05, "elapsed_time_per_iteration": 6.65627265, "memory(GiB)": 21.51, "elapsed_time": "1h 35m 21s", "remaining_time": "10h 1m 11s", "loss_scale": 1.0, "consumed_samples": 222976, "global_step/max_steps": "871/6362"} +{"lm loss": 5.23096228, "grad_norm": 0.92563897, "learning_rate": 9.8e-05, "elapsed_time_per_iteration": 6.66413689, "memory(GiB)": 21.51, "elapsed_time": "1h 35m 28s", "remaining_time": "10h 1m 5s", "loss_scale": 1.0, "consumed_samples": 223232, "global_step/max_steps": "872/6362"} +{"lm loss": 5.25408268, "grad_norm": 1.22043049, "learning_rate": 9.8e-05, "elapsed_time_per_iteration": 6.63436389, "memory(GiB)": 21.51, "elapsed_time": "1h 35m 35s", "remaining_time": "10h 0m 58s", "loss_scale": 1.0, "consumed_samples": 223488, "global_step/max_steps": "873/6362"} +{"lm loss": 5.22664213, "grad_norm": 1.22442579, "learning_rate": 9.799e-05, "elapsed_time_per_iteration": 6.61794138, "memory(GiB)": 21.51, "elapsed_time": "1h 35m 41s", "remaining_time": "10h 0m 52s", "loss_scale": 1.0, "consumed_samples": 223744, "global_step/max_steps": "874/6362"} +{"lm loss": 5.21670485, "grad_norm": 0.99121481, "learning_rate": 9.798e-05, "elapsed_time_per_iteration": 6.58228064, "memory(GiB)": 21.51, "elapsed_time": "1h 35m 48s", "remaining_time": "10h 0m 46s", "loss_scale": 1.0, "consumed_samples": 224000, "global_step/max_steps": "875/6362"} +{"lm loss": 5.22328901, "grad_norm": 1.19405413, "learning_rate": 9.797e-05, "elapsed_time_per_iteration": 6.62893033, "memory(GiB)": 21.51, "elapsed_time": "1h 35m 54s", "remaining_time": "10h 0m 40s", "loss_scale": 1.0, "consumed_samples": 224256, "global_step/max_steps": "876/6362"} +{"lm loss": 5.22751379, "grad_norm": 0.85955787, "learning_rate": 9.797e-05, "elapsed_time_per_iteration": 6.67588496, "memory(GiB)": 21.51, "elapsed_time": "1h 36m 1s", "remaining_time": "10h 0m 34s", "loss_scale": 1.0, "consumed_samples": 224512, "global_step/max_steps": "877/6362"} +{"lm loss": 5.22086477, "grad_norm": 0.80398166, "learning_rate": 9.796e-05, "elapsed_time_per_iteration": 6.39637446, "memory(GiB)": 21.51, "elapsed_time": "1h 36m 7s", "remaining_time": "10h 0m 26s", "loss_scale": 1.0, "consumed_samples": 224768, "global_step/max_steps": "878/6362"} +{"lm loss": 5.20425463, "grad_norm": 0.95480925, "learning_rate": 9.795e-05, "elapsed_time_per_iteration": 6.60027003, "memory(GiB)": 21.51, "elapsed_time": "1h 36m 14s", "remaining_time": "10h 0m 20s", "loss_scale": 1.0, "consumed_samples": 225024, "global_step/max_steps": "879/6362"} +{"lm loss": 5.24340868, "grad_norm": 1.29121244, "learning_rate": 9.795e-05, "elapsed_time_per_iteration": 6.75101852, "memory(GiB)": 21.51, "elapsed_time": "1h 36m 21s", "remaining_time": "10h 0m 14s", "loss_scale": 1.0, "consumed_samples": 225280, "global_step/max_steps": "880/6362"} +{"lm loss": 5.21700907, "grad_norm": 0.87587553, "learning_rate": 9.794e-05, "elapsed_time_per_iteration": 6.57635379, "memory(GiB)": 21.51, "elapsed_time": "1h 36m 27s", "remaining_time": "10h 0m 8s", "loss_scale": 1.0, "consumed_samples": 225536, "global_step/max_steps": "881/6362"} +{"lm loss": 5.23745203, "grad_norm": 1.08133316, "learning_rate": 9.793e-05, "elapsed_time_per_iteration": 6.45343471, "memory(GiB)": 21.51, "elapsed_time": "1h 36m 34s", "remaining_time": "10h 0m 0s", "loss_scale": 1.0, "consumed_samples": 225792, "global_step/max_steps": "882/6362"} +{"lm loss": 5.22769308, "grad_norm": 1.23575151, "learning_rate": 9.792e-05, "elapsed_time_per_iteration": 7.2088809, "memory(GiB)": 21.51, "elapsed_time": "1h 36m 41s", "remaining_time": "9h 59m 58s", "loss_scale": 1.0, "consumed_samples": 226048, "global_step/max_steps": "883/6362"} +{"lm loss": 5.24126768, "grad_norm": 0.90311724, "learning_rate": 9.792e-05, "elapsed_time_per_iteration": 6.46489811, "memory(GiB)": 21.51, "elapsed_time": "1h 36m 47s", "remaining_time": "9h 59m 51s", "loss_scale": 1.0, "consumed_samples": 226304, "global_step/max_steps": "884/6362"} +{"lm loss": 5.24172688, "grad_norm": 1.14184225, "learning_rate": 9.791e-05, "elapsed_time_per_iteration": 6.37347627, "memory(GiB)": 21.51, "elapsed_time": "1h 36m 54s", "remaining_time": "9h 59m 43s", "loss_scale": 1.0, "consumed_samples": 226560, "global_step/max_steps": "885/6362"} +{"lm loss": 5.21328688, "grad_norm": 1.06798887, "learning_rate": 9.79e-05, "elapsed_time_per_iteration": 6.42769742, "memory(GiB)": 21.51, "elapsed_time": "1h 37m 0s", "remaining_time": "9h 59m 35s", "loss_scale": 1.0, "consumed_samples": 226816, "global_step/max_steps": "886/6362"} +{"lm loss": 5.21351862, "grad_norm": 1.2308532, "learning_rate": 9.789e-05, "elapsed_time_per_iteration": 6.59165883, "memory(GiB)": 21.51, "elapsed_time": "1h 37m 7s", "remaining_time": "9h 59m 29s", "loss_scale": 1.0, "consumed_samples": 227072, "global_step/max_steps": "887/6362"} +{"lm loss": 5.23438025, "grad_norm": 0.97489977, "learning_rate": 9.789e-05, "elapsed_time_per_iteration": 6.50651145, "memory(GiB)": 21.51, "elapsed_time": "1h 37m 13s", "remaining_time": "9h 59m 22s", "loss_scale": 1.0, "consumed_samples": 227328, "global_step/max_steps": "888/6362"} +{"lm loss": 5.24187899, "grad_norm": 1.22569764, "learning_rate": 9.788e-05, "elapsed_time_per_iteration": 6.47581553, "memory(GiB)": 21.51, "elapsed_time": "1h 37m 20s", "remaining_time": "9h 59m 15s", "loss_scale": 1.0, "consumed_samples": 227584, "global_step/max_steps": "889/6362"} +{"lm loss": 5.22890806, "grad_norm": 0.95319164, "learning_rate": 9.787e-05, "elapsed_time_per_iteration": 6.34752393, "memory(GiB)": 21.51, "elapsed_time": "1h 37m 26s", "remaining_time": "9h 59m 7s", "loss_scale": 1.0, "consumed_samples": 227840, "global_step/max_steps": "890/6362"} +{"lm loss": 5.2364831, "grad_norm": 1.16236651, "learning_rate": 9.787e-05, "elapsed_time_per_iteration": 6.59887242, "memory(GiB)": 21.51, "elapsed_time": "1h 37m 33s", "remaining_time": "9h 59m 0s", "loss_scale": 1.0, "consumed_samples": 228096, "global_step/max_steps": "891/6362"} +{"lm loss": 5.21784687, "grad_norm": 1.06956172, "learning_rate": 9.786e-05, "elapsed_time_per_iteration": 6.51332021, "memory(GiB)": 21.51, "elapsed_time": "1h 37m 39s", "remaining_time": "9h 58m 54s", "loss_scale": 1.0, "consumed_samples": 228352, "global_step/max_steps": "892/6362"} +{"lm loss": 5.23176765, "grad_norm": 1.16449773, "learning_rate": 9.785e-05, "elapsed_time_per_iteration": 6.6300056, "memory(GiB)": 21.51, "elapsed_time": "1h 37m 46s", "remaining_time": "9h 58m 47s", "loss_scale": 1.0, "consumed_samples": 228608, "global_step/max_steps": "893/6362"} +{"lm loss": 5.225811, "grad_norm": 0.91176206, "learning_rate": 9.784e-05, "elapsed_time_per_iteration": 6.57715464, "memory(GiB)": 21.51, "elapsed_time": "1h 37m 53s", "remaining_time": "9h 58m 41s", "loss_scale": 1.0, "consumed_samples": 228864, "global_step/max_steps": "894/6362"} +{"lm loss": 5.23460674, "grad_norm": 0.93644005, "learning_rate": 9.784e-05, "elapsed_time_per_iteration": 6.5462513, "memory(GiB)": 21.51, "elapsed_time": "1h 37m 59s", "remaining_time": "9h 58m 34s", "loss_scale": 1.0, "consumed_samples": 229120, "global_step/max_steps": "895/6362"} +{"lm loss": 5.21303606, "grad_norm": 1.10560954, "learning_rate": 9.783e-05, "elapsed_time_per_iteration": 6.58180618, "memory(GiB)": 21.51, "elapsed_time": "1h 38m 6s", "remaining_time": "9h 58m 28s", "loss_scale": 1.0, "consumed_samples": 229376, "global_step/max_steps": "896/6362"} +{"lm loss": 5.22755098, "grad_norm": 1.28938711, "learning_rate": 9.782e-05, "elapsed_time_per_iteration": 6.43280149, "memory(GiB)": 21.51, "elapsed_time": "1h 38m 12s", "remaining_time": "9h 58m 20s", "loss_scale": 1.0, "consumed_samples": 229632, "global_step/max_steps": "897/6362"} +{"lm loss": 5.22390079, "grad_norm": 1.02525294, "learning_rate": 9.781e-05, "elapsed_time_per_iteration": 6.36786151, "memory(GiB)": 21.51, "elapsed_time": "1h 38m 18s", "remaining_time": "9h 58m 12s", "loss_scale": 1.0, "consumed_samples": 229888, "global_step/max_steps": "898/6362"} +{"lm loss": 5.22793865, "grad_norm": 1.12757277, "learning_rate": 9.781e-05, "elapsed_time_per_iteration": 6.67296433, "memory(GiB)": 21.51, "elapsed_time": "1h 38m 25s", "remaining_time": "9h 58m 6s", "loss_scale": 1.0, "consumed_samples": 230144, "global_step/max_steps": "899/6362"} +{"lm loss": 5.22142363, "grad_norm": 1.05185354, "learning_rate": 9.78e-05, "elapsed_time_per_iteration": 6.62496352, "memory(GiB)": 21.51, "elapsed_time": "1h 38m 32s", "remaining_time": "9h 58m 0s", "loss_scale": 1.0, "consumed_samples": 230400, "global_step/max_steps": "900/6362"} +{"lm loss": 5.23168325, "grad_norm": 1.11550641, "learning_rate": 9.779e-05, "elapsed_time_per_iteration": 6.53704786, "memory(GiB)": 21.51, "elapsed_time": "1h 38m 38s", "remaining_time": "9h 57m 53s", "loss_scale": 1.0, "consumed_samples": 230656, "global_step/max_steps": "901/6362"} +{"lm loss": 5.22902393, "grad_norm": 1.2702316, "learning_rate": 9.778e-05, "elapsed_time_per_iteration": 7.1861589, "memory(GiB)": 21.51, "elapsed_time": "1h 38m 45s", "remaining_time": "9h 57m 51s", "loss_scale": 1.0, "consumed_samples": 230912, "global_step/max_steps": "902/6362"} +{"lm loss": 5.20055389, "grad_norm": 0.93289214, "learning_rate": 9.778e-05, "elapsed_time_per_iteration": 6.46706247, "memory(GiB)": 21.51, "elapsed_time": "1h 38m 52s", "remaining_time": "9h 57m 43s", "loss_scale": 1.0, "consumed_samples": 231168, "global_step/max_steps": "903/6362"} +{"lm loss": 5.21125793, "grad_norm": 1.14858639, "learning_rate": 9.777e-05, "elapsed_time_per_iteration": 6.65202093, "memory(GiB)": 21.51, "elapsed_time": "1h 38m 59s", "remaining_time": "9h 57m 37s", "loss_scale": 1.0, "consumed_samples": 231424, "global_step/max_steps": "904/6362"} +{"lm loss": 5.22658777, "grad_norm": 1.06258333, "learning_rate": 9.776e-05, "elapsed_time_per_iteration": 6.43218446, "memory(GiB)": 21.51, "elapsed_time": "1h 39m 5s", "remaining_time": "9h 57m 30s", "loss_scale": 1.0, "consumed_samples": 231680, "global_step/max_steps": "905/6362"} +{"lm loss": 5.2127738, "grad_norm": 1.13568127, "learning_rate": 9.775e-05, "elapsed_time_per_iteration": 6.24975181, "memory(GiB)": 21.51, "elapsed_time": "1h 39m 11s", "remaining_time": "9h 57m 21s", "loss_scale": 1.0, "consumed_samples": 231936, "global_step/max_steps": "906/6362"} +{"lm loss": 5.2273488, "grad_norm": 1.19893634, "learning_rate": 9.775e-05, "elapsed_time_per_iteration": 6.30580902, "memory(GiB)": 21.51, "elapsed_time": "1h 39m 18s", "remaining_time": "9h 57m 13s", "loss_scale": 1.0, "consumed_samples": 232192, "global_step/max_steps": "907/6362"} +{"lm loss": 5.2241416, "grad_norm": 1.07758629, "learning_rate": 9.774e-05, "elapsed_time_per_iteration": 6.84301519, "memory(GiB)": 21.51, "elapsed_time": "1h 39m 24s", "remaining_time": "9h 57m 8s", "loss_scale": 1.0, "consumed_samples": 232448, "global_step/max_steps": "908/6362"} +{"lm loss": 5.20699739, "grad_norm": 1.0494082, "learning_rate": 9.773e-05, "elapsed_time_per_iteration": 6.64254928, "memory(GiB)": 21.51, "elapsed_time": "1h 39m 31s", "remaining_time": "9h 57m 2s", "loss_scale": 1.0, "consumed_samples": 232704, "global_step/max_steps": "909/6362"} +{"lm loss": 5.2077961, "grad_norm": 1.09234262, "learning_rate": 9.772e-05, "elapsed_time_per_iteration": 6.28653431, "memory(GiB)": 21.51, "elapsed_time": "1h 39m 37s", "remaining_time": "9h 56m 54s", "loss_scale": 1.0, "consumed_samples": 232960, "global_step/max_steps": "910/6362"} +{"lm loss": 5.2181983, "grad_norm": 1.30461884, "learning_rate": 9.771e-05, "elapsed_time_per_iteration": 6.37951827, "memory(GiB)": 21.51, "elapsed_time": "1h 39m 44s", "remaining_time": "9h 56m 46s", "loss_scale": 1.0, "consumed_samples": 233216, "global_step/max_steps": "911/6362"} +{"lm loss": 5.20133495, "grad_norm": 0.94666129, "learning_rate": 9.771e-05, "elapsed_time_per_iteration": 6.58270955, "memory(GiB)": 21.51, "elapsed_time": "1h 39m 50s", "remaining_time": "9h 56m 40s", "loss_scale": 1.0, "consumed_samples": 233472, "global_step/max_steps": "912/6362"} +{"lm loss": 5.23459911, "grad_norm": 1.23430276, "learning_rate": 9.77e-05, "elapsed_time_per_iteration": 6.5009594, "memory(GiB)": 21.51, "elapsed_time": "1h 39m 57s", "remaining_time": "9h 56m 33s", "loss_scale": 1.0, "consumed_samples": 233728, "global_step/max_steps": "913/6362"} +{"lm loss": 5.21751881, "grad_norm": 0.97538543, "learning_rate": 9.769e-05, "elapsed_time_per_iteration": 6.62538433, "memory(GiB)": 21.51, "elapsed_time": "1h 40m 3s", "remaining_time": "9h 56m 27s", "loss_scale": 1.0, "consumed_samples": 233984, "global_step/max_steps": "914/6362"} +{"lm loss": 5.21998644, "grad_norm": 0.99040002, "learning_rate": 9.768e-05, "elapsed_time_per_iteration": 6.47780108, "memory(GiB)": 21.51, "elapsed_time": "1h 40m 10s", "remaining_time": "9h 56m 19s", "loss_scale": 1.0, "consumed_samples": 234240, "global_step/max_steps": "915/6362"} +{"lm loss": 5.24816847, "grad_norm": 0.97121531, "learning_rate": 9.768e-05, "elapsed_time_per_iteration": 6.50989509, "memory(GiB)": 21.51, "elapsed_time": "1h 40m 16s", "remaining_time": "9h 56m 13s", "loss_scale": 1.0, "consumed_samples": 234496, "global_step/max_steps": "916/6362"} +{"lm loss": 5.2053504, "grad_norm": 1.10990822, "learning_rate": 9.767e-05, "elapsed_time_per_iteration": 6.41055036, "memory(GiB)": 21.51, "elapsed_time": "1h 40m 23s", "remaining_time": "9h 56m 5s", "loss_scale": 1.0, "consumed_samples": 234752, "global_step/max_steps": "917/6362"} +{"lm loss": 5.21419334, "grad_norm": 1.01801145, "learning_rate": 9.766e-05, "elapsed_time_per_iteration": 6.4227047, "memory(GiB)": 21.51, "elapsed_time": "1h 40m 29s", "remaining_time": "9h 55m 58s", "loss_scale": 1.0, "consumed_samples": 235008, "global_step/max_steps": "918/6362"} +{"lm loss": 5.20421171, "grad_norm": 0.99299937, "learning_rate": 9.765e-05, "elapsed_time_per_iteration": 6.6370337, "memory(GiB)": 21.51, "elapsed_time": "1h 40m 36s", "remaining_time": "9h 55m 51s", "loss_scale": 1.0, "consumed_samples": 235264, "global_step/max_steps": "919/6362"} +{"lm loss": 5.22563267, "grad_norm": 0.99961466, "learning_rate": 9.765e-05, "elapsed_time_per_iteration": 6.53285384, "memory(GiB)": 21.51, "elapsed_time": "1h 40m 42s", "remaining_time": "9h 55m 45s", "loss_scale": 1.0, "consumed_samples": 235520, "global_step/max_steps": "920/6362"} +{"lm loss": 5.2244029, "grad_norm": 1.10048616, "learning_rate": 9.764e-05, "elapsed_time_per_iteration": 6.71577811, "memory(GiB)": 21.51, "elapsed_time": "1h 40m 49s", "remaining_time": "9h 55m 39s", "loss_scale": 1.0, "consumed_samples": 235776, "global_step/max_steps": "921/6362"} +{"lm loss": 5.22195101, "grad_norm": 0.9757843, "learning_rate": 9.763e-05, "elapsed_time_per_iteration": 6.59047771, "memory(GiB)": 21.51, "elapsed_time": "1h 40m 56s", "remaining_time": "9h 55m 33s", "loss_scale": 1.0, "consumed_samples": 236032, "global_step/max_steps": "922/6362"} +{"lm loss": 5.21378422, "grad_norm": 1.17234731, "learning_rate": 9.762e-05, "elapsed_time_per_iteration": 6.63938642, "memory(GiB)": 21.51, "elapsed_time": "1h 41m 2s", "remaining_time": "9h 55m 26s", "loss_scale": 1.0, "consumed_samples": 236288, "global_step/max_steps": "923/6362"} +{"lm loss": 5.21475983, "grad_norm": 0.78227305, "learning_rate": 9.761e-05, "elapsed_time_per_iteration": 6.45466757, "memory(GiB)": 21.51, "elapsed_time": "1h 41m 9s", "remaining_time": "9h 55m 19s", "loss_scale": 1.0, "consumed_samples": 236544, "global_step/max_steps": "924/6362"} +{"lm loss": 5.20025635, "grad_norm": 1.0059309, "learning_rate": 9.761e-05, "elapsed_time_per_iteration": 6.51963329, "memory(GiB)": 21.51, "elapsed_time": "1h 41m 15s", "remaining_time": "9h 55m 12s", "loss_scale": 1.0, "consumed_samples": 236800, "global_step/max_steps": "925/6362"} +{"lm loss": 5.20212603, "grad_norm": 1.27890182, "learning_rate": 9.76e-05, "elapsed_time_per_iteration": 6.7635653, "memory(GiB)": 21.51, "elapsed_time": "1h 41m 22s", "remaining_time": "9h 55m 7s", "loss_scale": 1.0, "consumed_samples": 237056, "global_step/max_steps": "926/6362"} +{"lm loss": 5.20806646, "grad_norm": 0.93361253, "learning_rate": 9.759e-05, "elapsed_time_per_iteration": 6.4138062, "memory(GiB)": 21.51, "elapsed_time": "1h 41m 29s", "remaining_time": "9h 54m 59s", "loss_scale": 1.0, "consumed_samples": 237312, "global_step/max_steps": "927/6362"} +{"lm loss": 5.23006248, "grad_norm": 1.15810466, "learning_rate": 9.758e-05, "elapsed_time_per_iteration": 6.92971158, "memory(GiB)": 21.51, "elapsed_time": "1h 41m 35s", "remaining_time": "9h 54m 55s", "loss_scale": 1.0, "consumed_samples": 237568, "global_step/max_steps": "928/6362"} +{"lm loss": 5.21607113, "grad_norm": 1.07590675, "learning_rate": 9.758e-05, "elapsed_time_per_iteration": 6.7846694, "memory(GiB)": 21.51, "elapsed_time": "1h 41m 42s", "remaining_time": "9h 54m 50s", "loss_scale": 1.0, "consumed_samples": 237824, "global_step/max_steps": "929/6362"} +{"lm loss": 5.20849037, "grad_norm": 1.25097775, "learning_rate": 9.757e-05, "elapsed_time_per_iteration": 6.84312892, "memory(GiB)": 21.51, "elapsed_time": "1h 41m 49s", "remaining_time": "9h 54m 45s", "loss_scale": 1.0, "consumed_samples": 238080, "global_step/max_steps": "930/6362"} +{"lm loss": 5.21696806, "grad_norm": 0.8194176, "learning_rate": 9.756e-05, "elapsed_time_per_iteration": 6.57199478, "memory(GiB)": 21.51, "elapsed_time": "1h 41m 56s", "remaining_time": "9h 54m 38s", "loss_scale": 1.0, "consumed_samples": 238336, "global_step/max_steps": "931/6362"} +{"lm loss": 5.21850586, "grad_norm": 0.97808558, "learning_rate": 9.755e-05, "elapsed_time_per_iteration": 6.59007215, "memory(GiB)": 21.51, "elapsed_time": "1h 42m 2s", "remaining_time": "9h 54m 32s", "loss_scale": 1.0, "consumed_samples": 238592, "global_step/max_steps": "932/6362"} +{"lm loss": 5.21591806, "grad_norm": 1.33542657, "learning_rate": 9.754e-05, "elapsed_time_per_iteration": 6.65906549, "memory(GiB)": 21.51, "elapsed_time": "1h 42m 9s", "remaining_time": "9h 54m 26s", "loss_scale": 1.0, "consumed_samples": 238848, "global_step/max_steps": "933/6362"} +{"lm loss": 5.20893955, "grad_norm": 0.94437456, "learning_rate": 9.754e-05, "elapsed_time_per_iteration": 6.73821998, "memory(GiB)": 21.51, "elapsed_time": "1h 42m 16s", "remaining_time": "9h 54m 20s", "loss_scale": 1.0, "consumed_samples": 239104, "global_step/max_steps": "934/6362"} +{"lm loss": 5.20111847, "grad_norm": 1.27397084, "learning_rate": 9.753e-05, "elapsed_time_per_iteration": 6.6182313, "memory(GiB)": 21.51, "elapsed_time": "1h 42m 22s", "remaining_time": "9h 54m 14s", "loss_scale": 1.0, "consumed_samples": 239360, "global_step/max_steps": "935/6362"} +{"lm loss": 5.21027517, "grad_norm": 0.9728272, "learning_rate": 9.752e-05, "elapsed_time_per_iteration": 6.65752506, "memory(GiB)": 21.51, "elapsed_time": "1h 42m 29s", "remaining_time": "9h 54m 8s", "loss_scale": 1.0, "consumed_samples": 239616, "global_step/max_steps": "936/6362"} +{"lm loss": 5.21895409, "grad_norm": 1.13208973, "learning_rate": 9.751e-05, "elapsed_time_per_iteration": 6.61461759, "memory(GiB)": 21.51, "elapsed_time": "1h 42m 36s", "remaining_time": "9h 54m 1s", "loss_scale": 1.0, "consumed_samples": 239872, "global_step/max_steps": "937/6362"} +{"lm loss": 5.20033884, "grad_norm": 1.09753537, "learning_rate": 9.75e-05, "elapsed_time_per_iteration": 6.79721236, "memory(GiB)": 21.51, "elapsed_time": "1h 42m 42s", "remaining_time": "9h 53m 56s", "loss_scale": 1.0, "consumed_samples": 240128, "global_step/max_steps": "938/6362"} +{"lm loss": 5.20449543, "grad_norm": 1.18962455, "learning_rate": 9.75e-05, "elapsed_time_per_iteration": 6.78062367, "memory(GiB)": 21.51, "elapsed_time": "1h 42m 49s", "remaining_time": "9h 53m 51s", "loss_scale": 1.0, "consumed_samples": 240384, "global_step/max_steps": "939/6362"} +{"lm loss": 5.21441555, "grad_norm": 0.91935217, "learning_rate": 9.749e-05, "elapsed_time_per_iteration": 6.83946872, "memory(GiB)": 21.51, "elapsed_time": "1h 42m 56s", "remaining_time": "9h 53m 46s", "loss_scale": 1.0, "consumed_samples": 240640, "global_step/max_steps": "940/6362"} +{"lm loss": 5.20710945, "grad_norm": 0.85903156, "learning_rate": 9.748e-05, "elapsed_time_per_iteration": 6.49269032, "memory(GiB)": 21.51, "elapsed_time": "1h 43m 2s", "remaining_time": "9h 53m 39s", "loss_scale": 1.0, "consumed_samples": 240896, "global_step/max_steps": "941/6362"} +{"lm loss": 5.21978235, "grad_norm": 0.91427702, "learning_rate": 9.747e-05, "elapsed_time_per_iteration": 6.58183146, "memory(GiB)": 21.51, "elapsed_time": "1h 43m 9s", "remaining_time": "9h 53m 32s", "loss_scale": 1.0, "consumed_samples": 241152, "global_step/max_steps": "942/6362"} +{"lm loss": 5.18784332, "grad_norm": 1.29872286, "learning_rate": 9.746e-05, "elapsed_time_per_iteration": 6.71318603, "memory(GiB)": 21.51, "elapsed_time": "1h 43m 16s", "remaining_time": "9h 53m 26s", "loss_scale": 1.0, "consumed_samples": 241408, "global_step/max_steps": "943/6362"} +{"lm loss": 5.20484209, "grad_norm": 0.97485435, "learning_rate": 9.746e-05, "elapsed_time_per_iteration": 6.6459868, "memory(GiB)": 21.51, "elapsed_time": "1h 43m 22s", "remaining_time": "9h 53m 20s", "loss_scale": 1.0, "consumed_samples": 241664, "global_step/max_steps": "944/6362"} +{"lm loss": 5.23273468, "grad_norm": 1.12212598, "learning_rate": 9.745e-05, "elapsed_time_per_iteration": 6.61250544, "memory(GiB)": 21.51, "elapsed_time": "1h 43m 29s", "remaining_time": "9h 53m 14s", "loss_scale": 1.0, "consumed_samples": 241920, "global_step/max_steps": "945/6362"} +{"lm loss": 5.2006197, "grad_norm": 0.99915624, "learning_rate": 9.744e-05, "elapsed_time_per_iteration": 6.6664238, "memory(GiB)": 21.51, "elapsed_time": "1h 43m 36s", "remaining_time": "9h 53m 8s", "loss_scale": 1.0, "consumed_samples": 242176, "global_step/max_steps": "946/6362"} +{"lm loss": 5.22160959, "grad_norm": 0.91272306, "learning_rate": 9.743e-05, "elapsed_time_per_iteration": 6.61808252, "memory(GiB)": 21.51, "elapsed_time": "1h 43m 42s", "remaining_time": "9h 53m 2s", "loss_scale": 1.0, "consumed_samples": 242432, "global_step/max_steps": "947/6362"} +{"lm loss": 5.21459293, "grad_norm": 1.12483835, "learning_rate": 9.742e-05, "elapsed_time_per_iteration": 6.79001236, "memory(GiB)": 21.51, "elapsed_time": "1h 43m 49s", "remaining_time": "9h 52m 56s", "loss_scale": 1.0, "consumed_samples": 242688, "global_step/max_steps": "948/6362"} +{"lm loss": 5.2083025, "grad_norm": 1.28441131, "learning_rate": 9.742e-05, "elapsed_time_per_iteration": 6.68945456, "memory(GiB)": 21.51, "elapsed_time": "1h 43m 56s", "remaining_time": "9h 52m 50s", "loss_scale": 1.0, "consumed_samples": 242944, "global_step/max_steps": "949/6362"} +{"lm loss": 5.19514513, "grad_norm": 0.79844737, "learning_rate": 9.741e-05, "elapsed_time_per_iteration": 6.86655641, "memory(GiB)": 21.51, "elapsed_time": "1h 44m 3s", "remaining_time": "9h 52m 46s", "loss_scale": 1.0, "consumed_samples": 243200, "global_step/max_steps": "950/6362"} +{"lm loss": 5.19379187, "grad_norm": 0.91153294, "learning_rate": 9.74e-05, "elapsed_time_per_iteration": 6.7649529, "memory(GiB)": 21.51, "elapsed_time": "1h 44m 9s", "remaining_time": "9h 52m 40s", "loss_scale": 1.0, "consumed_samples": 243456, "global_step/max_steps": "951/6362"} +{"lm loss": 5.20716095, "grad_norm": 1.10229743, "learning_rate": 9.739e-05, "elapsed_time_per_iteration": 6.42830467, "memory(GiB)": 21.51, "elapsed_time": "1h 44m 16s", "remaining_time": "9h 52m 33s", "loss_scale": 1.0, "consumed_samples": 243712, "global_step/max_steps": "952/6362"} +{"lm loss": 5.19765615, "grad_norm": 1.29426432, "learning_rate": 9.738e-05, "elapsed_time_per_iteration": 6.70715761, "memory(GiB)": 21.51, "elapsed_time": "1h 44m 23s", "remaining_time": "9h 52m 27s", "loss_scale": 1.0, "consumed_samples": 243968, "global_step/max_steps": "953/6362"} +{"lm loss": 5.21570015, "grad_norm": 0.92300946, "learning_rate": 9.737e-05, "elapsed_time_per_iteration": 6.49158311, "memory(GiB)": 21.51, "elapsed_time": "1h 44m 29s", "remaining_time": "9h 52m 20s", "loss_scale": 1.0, "consumed_samples": 244224, "global_step/max_steps": "954/6362"} +{"lm loss": 5.20590067, "grad_norm": 1.06225479, "learning_rate": 9.737e-05, "elapsed_time_per_iteration": 6.64876819, "memory(GiB)": 21.51, "elapsed_time": "1h 44m 36s", "remaining_time": "9h 52m 14s", "loss_scale": 1.0, "consumed_samples": 244480, "global_step/max_steps": "955/6362"} +{"lm loss": 5.22200489, "grad_norm": 1.04008067, "learning_rate": 9.736e-05, "elapsed_time_per_iteration": 6.42104435, "memory(GiB)": 21.51, "elapsed_time": "1h 44m 42s", "remaining_time": "9h 52m 6s", "loss_scale": 1.0, "consumed_samples": 244736, "global_step/max_steps": "956/6362"} +{"lm loss": 5.21344614, "grad_norm": 1.01338792, "learning_rate": 9.735e-05, "elapsed_time_per_iteration": 6.37511396, "memory(GiB)": 21.51, "elapsed_time": "1h 44m 48s", "remaining_time": "9h 51m 59s", "loss_scale": 1.0, "consumed_samples": 244992, "global_step/max_steps": "957/6362"} +{"lm loss": 5.19251919, "grad_norm": 1.26974118, "learning_rate": 9.734e-05, "elapsed_time_per_iteration": 6.60488701, "memory(GiB)": 21.51, "elapsed_time": "1h 44m 55s", "remaining_time": "9h 51m 52s", "loss_scale": 1.0, "consumed_samples": 245248, "global_step/max_steps": "958/6362"} +{"lm loss": 5.21450472, "grad_norm": 0.86126316, "learning_rate": 9.733e-05, "elapsed_time_per_iteration": 6.56335068, "memory(GiB)": 21.51, "elapsed_time": "1h 45m 2s", "remaining_time": "9h 51m 46s", "loss_scale": 1.0, "consumed_samples": 245504, "global_step/max_steps": "959/6362"} +{"lm loss": 5.19483471, "grad_norm": 1.05407286, "learning_rate": 9.733e-05, "elapsed_time_per_iteration": 6.63371992, "memory(GiB)": 21.51, "elapsed_time": "1h 45m 8s", "remaining_time": "9h 51m 39s", "loss_scale": 1.0, "consumed_samples": 245760, "global_step/max_steps": "960/6362"} +{"lm loss": 5.22408342, "grad_norm": 0.97701532, "learning_rate": 9.732e-05, "elapsed_time_per_iteration": 6.65976596, "memory(GiB)": 21.51, "elapsed_time": "1h 45m 15s", "remaining_time": "9h 51m 33s", "loss_scale": 1.0, "consumed_samples": 246016, "global_step/max_steps": "961/6362"} +{"lm loss": 5.23236609, "grad_norm": 1.06841218, "learning_rate": 9.731e-05, "elapsed_time_per_iteration": 6.49128318, "memory(GiB)": 21.51, "elapsed_time": "1h 45m 21s", "remaining_time": "9h 51m 26s", "loss_scale": 1.0, "consumed_samples": 246272, "global_step/max_steps": "962/6362"} +{"lm loss": 5.18747473, "grad_norm": 0.97660595, "learning_rate": 9.73e-05, "elapsed_time_per_iteration": 6.56415701, "memory(GiB)": 21.51, "elapsed_time": "1h 45m 28s", "remaining_time": "9h 51m 20s", "loss_scale": 1.0, "consumed_samples": 246528, "global_step/max_steps": "963/6362"} +{"lm loss": 5.20219707, "grad_norm": 1.17031884, "learning_rate": 9.729e-05, "elapsed_time_per_iteration": 6.49852419, "memory(GiB)": 21.51, "elapsed_time": "1h 45m 34s", "remaining_time": "9h 51m 13s", "loss_scale": 1.0, "consumed_samples": 246784, "global_step/max_steps": "964/6362"} +{"lm loss": 5.18873453, "grad_norm": 1.10760438, "learning_rate": 9.728e-05, "elapsed_time_per_iteration": 6.41887617, "memory(GiB)": 21.51, "elapsed_time": "1h 45m 41s", "remaining_time": "9h 51m 5s", "loss_scale": 1.0, "consumed_samples": 247040, "global_step/max_steps": "965/6362"} +{"lm loss": 5.2304039, "grad_norm": 0.96338338, "learning_rate": 9.728e-05, "elapsed_time_per_iteration": 6.27020955, "memory(GiB)": 21.51, "elapsed_time": "1h 45m 47s", "remaining_time": "9h 50m 57s", "loss_scale": 1.0, "consumed_samples": 247296, "global_step/max_steps": "966/6362"} +{"lm loss": 5.19372272, "grad_norm": 0.88463324, "learning_rate": 9.727e-05, "elapsed_time_per_iteration": 6.48067498, "memory(GiB)": 21.51, "elapsed_time": "1h 45m 54s", "remaining_time": "9h 50m 50s", "loss_scale": 1.0, "consumed_samples": 247552, "global_step/max_steps": "967/6362"} +{"lm loss": 5.21161938, "grad_norm": 0.99367565, "learning_rate": 9.726e-05, "elapsed_time_per_iteration": 6.47959232, "memory(GiB)": 21.51, "elapsed_time": "1h 46m 0s", "remaining_time": "9h 50m 43s", "loss_scale": 1.0, "consumed_samples": 247808, "global_step/max_steps": "968/6362"} +{"lm loss": 5.1984024, "grad_norm": 1.22576487, "learning_rate": 9.725e-05, "elapsed_time_per_iteration": 6.6421721, "memory(GiB)": 21.51, "elapsed_time": "1h 46m 7s", "remaining_time": "9h 50m 37s", "loss_scale": 1.0, "consumed_samples": 248064, "global_step/max_steps": "969/6362"} +{"lm loss": 5.20466661, "grad_norm": 0.86820441, "learning_rate": 9.724e-05, "elapsed_time_per_iteration": 6.68998528, "memory(GiB)": 21.51, "elapsed_time": "1h 46m 13s", "remaining_time": "9h 50m 31s", "loss_scale": 1.0, "consumed_samples": 248320, "global_step/max_steps": "970/6362"} +{"lm loss": 5.19555426, "grad_norm": 1.11500657, "learning_rate": 9.723e-05, "elapsed_time_per_iteration": 6.67520046, "memory(GiB)": 21.51, "elapsed_time": "1h 46m 20s", "remaining_time": "9h 50m 25s", "loss_scale": 1.0, "consumed_samples": 248576, "global_step/max_steps": "971/6362"} +{"lm loss": 5.21397781, "grad_norm": 1.15048528, "learning_rate": 9.723e-05, "elapsed_time_per_iteration": 6.56090569, "memory(GiB)": 21.51, "elapsed_time": "1h 46m 27s", "remaining_time": "9h 50m 18s", "loss_scale": 1.0, "consumed_samples": 248832, "global_step/max_steps": "972/6362"} +{"lm loss": 5.20915079, "grad_norm": 1.0140363, "learning_rate": 9.722e-05, "elapsed_time_per_iteration": 6.53165889, "memory(GiB)": 21.51, "elapsed_time": "1h 46m 33s", "remaining_time": "9h 50m 11s", "loss_scale": 1.0, "consumed_samples": 249088, "global_step/max_steps": "973/6362"} +{"lm loss": 5.20207644, "grad_norm": 1.08498895, "learning_rate": 9.721e-05, "elapsed_time_per_iteration": 6.48742127, "memory(GiB)": 21.51, "elapsed_time": "1h 46m 40s", "remaining_time": "9h 50m 4s", "loss_scale": 1.0, "consumed_samples": 249344, "global_step/max_steps": "974/6362"} +{"lm loss": 5.20841503, "grad_norm": 0.98218584, "learning_rate": 9.72e-05, "elapsed_time_per_iteration": 6.6883328, "memory(GiB)": 21.51, "elapsed_time": "1h 46m 46s", "remaining_time": "9h 49m 58s", "loss_scale": 1.0, "consumed_samples": 249600, "global_step/max_steps": "975/6362"} +{"lm loss": 5.18748713, "grad_norm": 0.91478813, "learning_rate": 9.719e-05, "elapsed_time_per_iteration": 6.63807917, "memory(GiB)": 21.51, "elapsed_time": "1h 46m 53s", "remaining_time": "9h 49m 52s", "loss_scale": 1.0, "consumed_samples": 249856, "global_step/max_steps": "976/6362"} +{"lm loss": 5.20949125, "grad_norm": 0.95977104, "learning_rate": 9.718e-05, "elapsed_time_per_iteration": 6.35997486, "memory(GiB)": 21.51, "elapsed_time": "1h 46m 59s", "remaining_time": "9h 49m 44s", "loss_scale": 1.0, "consumed_samples": 250112, "global_step/max_steps": "977/6362"} +{"lm loss": 5.21003914, "grad_norm": 0.99131709, "learning_rate": 9.717e-05, "elapsed_time_per_iteration": 6.49518466, "memory(GiB)": 21.51, "elapsed_time": "1h 47m 6s", "remaining_time": "9h 49m 38s", "loss_scale": 1.0, "consumed_samples": 250368, "global_step/max_steps": "978/6362"} +{"lm loss": 5.21492958, "grad_norm": 1.19168949, "learning_rate": 9.717e-05, "elapsed_time_per_iteration": 6.42362952, "memory(GiB)": 21.51, "elapsed_time": "1h 47m 12s", "remaining_time": "9h 49m 30s", "loss_scale": 1.0, "consumed_samples": 250624, "global_step/max_steps": "979/6362"} +{"lm loss": 5.18529272, "grad_norm": 1.05549407, "learning_rate": 9.716e-05, "elapsed_time_per_iteration": 6.50126839, "memory(GiB)": 21.51, "elapsed_time": "1h 47m 19s", "remaining_time": "9h 49m 23s", "loss_scale": 1.0, "consumed_samples": 250880, "global_step/max_steps": "980/6362"} +{"lm loss": 5.20504665, "grad_norm": 0.98824614, "learning_rate": 9.715e-05, "elapsed_time_per_iteration": 6.39268064, "memory(GiB)": 21.51, "elapsed_time": "1h 47m 25s", "remaining_time": "9h 49m 16s", "loss_scale": 1.0, "consumed_samples": 251136, "global_step/max_steps": "981/6362"} +{"lm loss": 5.19016171, "grad_norm": 0.92436785, "learning_rate": 9.714e-05, "elapsed_time_per_iteration": 6.41511965, "memory(GiB)": 21.51, "elapsed_time": "1h 47m 32s", "remaining_time": "9h 49m 8s", "loss_scale": 1.0, "consumed_samples": 251392, "global_step/max_steps": "982/6362"} +{"lm loss": 5.20581579, "grad_norm": 0.87694716, "learning_rate": 9.713e-05, "elapsed_time_per_iteration": 6.40609741, "memory(GiB)": 21.51, "elapsed_time": "1h 47m 38s", "remaining_time": "9h 49m 1s", "loss_scale": 1.0, "consumed_samples": 251648, "global_step/max_steps": "983/6362"} +{"lm loss": 5.23416138, "grad_norm": 0.81093717, "learning_rate": 9.712e-05, "elapsed_time_per_iteration": 6.59254694, "memory(GiB)": 21.51, "elapsed_time": "1h 47m 45s", "remaining_time": "9h 48m 54s", "loss_scale": 1.0, "consumed_samples": 251904, "global_step/max_steps": "984/6362"} +{"lm loss": 5.20602179, "grad_norm": 0.83185929, "learning_rate": 9.712e-05, "elapsed_time_per_iteration": 6.59490705, "memory(GiB)": 21.51, "elapsed_time": "1h 47m 51s", "remaining_time": "9h 48m 48s", "loss_scale": 1.0, "consumed_samples": 252160, "global_step/max_steps": "985/6362"} +{"lm loss": 5.20236969, "grad_norm": 1.04056442, "learning_rate": 9.711e-05, "elapsed_time_per_iteration": 6.64702415, "memory(GiB)": 21.51, "elapsed_time": "1h 47m 58s", "remaining_time": "9h 48m 42s", "loss_scale": 1.0, "consumed_samples": 252416, "global_step/max_steps": "986/6362"} +{"lm loss": 5.20488644, "grad_norm": 1.26569259, "learning_rate": 9.71e-05, "elapsed_time_per_iteration": 6.63772917, "memory(GiB)": 21.51, "elapsed_time": "1h 48m 5s", "remaining_time": "9h 48m 35s", "loss_scale": 1.0, "consumed_samples": 252672, "global_step/max_steps": "987/6362"} +{"lm loss": 5.20116806, "grad_norm": 1.03323746, "learning_rate": 9.709e-05, "elapsed_time_per_iteration": 6.51945305, "memory(GiB)": 21.51, "elapsed_time": "1h 48m 11s", "remaining_time": "9h 48m 29s", "loss_scale": 1.0, "consumed_samples": 252928, "global_step/max_steps": "988/6362"} +{"lm loss": 5.21470022, "grad_norm": 1.13879943, "learning_rate": 9.708e-05, "elapsed_time_per_iteration": 6.49526811, "memory(GiB)": 21.51, "elapsed_time": "1h 48m 18s", "remaining_time": "9h 48m 22s", "loss_scale": 1.0, "consumed_samples": 253184, "global_step/max_steps": "989/6362"} +{"lm loss": 5.19195843, "grad_norm": 1.00229728, "learning_rate": 9.707e-05, "elapsed_time_per_iteration": 6.51050282, "memory(GiB)": 21.51, "elapsed_time": "1h 48m 24s", "remaining_time": "9h 48m 15s", "loss_scale": 1.0, "consumed_samples": 253440, "global_step/max_steps": "990/6362"} +{"lm loss": 5.17616749, "grad_norm": 1.00435328, "learning_rate": 9.706e-05, "elapsed_time_per_iteration": 6.62490296, "memory(GiB)": 21.51, "elapsed_time": "1h 48m 31s", "remaining_time": "9h 48m 8s", "loss_scale": 1.0, "consumed_samples": 253696, "global_step/max_steps": "991/6362"} +{"lm loss": 5.21280003, "grad_norm": 0.89845532, "learning_rate": 9.705e-05, "elapsed_time_per_iteration": 6.41137171, "memory(GiB)": 21.51, "elapsed_time": "1h 48m 37s", "remaining_time": "9h 48m 1s", "loss_scale": 1.0, "consumed_samples": 253952, "global_step/max_steps": "992/6362"} +{"lm loss": 5.19932032, "grad_norm": 0.94870698, "learning_rate": 9.705e-05, "elapsed_time_per_iteration": 6.3863008, "memory(GiB)": 21.51, "elapsed_time": "1h 48m 43s", "remaining_time": "9h 47m 54s", "loss_scale": 1.0, "consumed_samples": 254208, "global_step/max_steps": "993/6362"} +{"lm loss": 5.20095158, "grad_norm": 1.07802665, "learning_rate": 9.704e-05, "elapsed_time_per_iteration": 6.6479857, "memory(GiB)": 21.51, "elapsed_time": "1h 48m 50s", "remaining_time": "9h 47m 47s", "loss_scale": 1.0, "consumed_samples": 254464, "global_step/max_steps": "994/6362"} +{"lm loss": 5.18976593, "grad_norm": 1.0075475, "learning_rate": 9.703e-05, "elapsed_time_per_iteration": 6.82834935, "memory(GiB)": 21.51, "elapsed_time": "1h 48m 57s", "remaining_time": "9h 47m 42s", "loss_scale": 1.0, "consumed_samples": 254720, "global_step/max_steps": "995/6362"} +{"lm loss": 5.20744038, "grad_norm": 0.97101319, "learning_rate": 9.702e-05, "elapsed_time_per_iteration": 6.79285455, "memory(GiB)": 21.51, "elapsed_time": "1h 49m 4s", "remaining_time": "9h 47m 37s", "loss_scale": 1.0, "consumed_samples": 254976, "global_step/max_steps": "996/6362"} +{"lm loss": 5.18778849, "grad_norm": 1.01483965, "learning_rate": 9.701e-05, "elapsed_time_per_iteration": 6.64510369, "memory(GiB)": 21.51, "elapsed_time": "1h 49m 10s", "remaining_time": "9h 47m 31s", "loss_scale": 1.0, "consumed_samples": 255232, "global_step/max_steps": "997/6362"} +{"lm loss": 5.21771765, "grad_norm": 1.25782835, "learning_rate": 9.7e-05, "elapsed_time_per_iteration": 6.55261588, "memory(GiB)": 21.51, "elapsed_time": "1h 49m 17s", "remaining_time": "9h 47m 24s", "loss_scale": 1.0, "consumed_samples": 255488, "global_step/max_steps": "998/6362"} +{"lm loss": 5.21171188, "grad_norm": 1.09344745, "learning_rate": 9.699e-05, "elapsed_time_per_iteration": 6.4342711, "memory(GiB)": 21.51, "elapsed_time": "1h 49m 23s", "remaining_time": "9h 47m 17s", "loss_scale": 1.0, "consumed_samples": 255744, "global_step/max_steps": "999/6362"} +{"lm loss": 5.18960094, "grad_norm": 1.06561553, "learning_rate": 9.699e-05, "elapsed_time_per_iteration": 6.61554027, "memory(GiB)": 21.51, "elapsed_time": "1h 49m 30s", "remaining_time": "9h 47m 10s", "loss_scale": 1.0, "consumed_samples": 256000, "global_step/max_steps": "1000/6362"} +{"lm loss": 5.17940855, "grad_norm": 1.01993763, "learning_rate": 9.698e-05, "elapsed_time_per_iteration": 6.66243672, "memory(GiB)": 21.51, "elapsed_time": "1h 49m 37s", "remaining_time": "9h 47m 4s", "loss_scale": 1.0, "consumed_samples": 256256, "global_step/max_steps": "1001/6362"} +{"lm loss": 5.19052553, "grad_norm": 0.92221093, "learning_rate": 9.697e-05, "elapsed_time_per_iteration": 6.39158535, "memory(GiB)": 21.51, "elapsed_time": "1h 49m 43s", "remaining_time": "9h 46m 57s", "loss_scale": 1.0, "consumed_samples": 256512, "global_step/max_steps": "1002/6362"} +{"lm loss": 5.19387531, "grad_norm": 0.98734486, "learning_rate": 9.696e-05, "elapsed_time_per_iteration": 6.55628467, "memory(GiB)": 21.51, "elapsed_time": "1h 49m 50s", "remaining_time": "9h 46m 50s", "loss_scale": 1.0, "consumed_samples": 256768, "global_step/max_steps": "1003/6362"} +{"lm loss": 5.18299627, "grad_norm": 1.29045486, "learning_rate": 9.695e-05, "elapsed_time_per_iteration": 6.53453279, "memory(GiB)": 21.51, "elapsed_time": "1h 49m 56s", "remaining_time": "9h 46m 43s", "loss_scale": 1.0, "consumed_samples": 257024, "global_step/max_steps": "1004/6362"} +{"lm loss": 5.19821692, "grad_norm": 0.87816274, "learning_rate": 9.694e-05, "elapsed_time_per_iteration": 6.38899589, "memory(GiB)": 21.51, "elapsed_time": "1h 50m 2s", "remaining_time": "9h 46m 36s", "loss_scale": 1.0, "consumed_samples": 257280, "global_step/max_steps": "1005/6362"} +{"lm loss": 5.20769978, "grad_norm": 0.96809626, "learning_rate": 9.693e-05, "elapsed_time_per_iteration": 6.32464838, "memory(GiB)": 21.51, "elapsed_time": "1h 50m 9s", "remaining_time": "9h 46m 28s", "loss_scale": 1.0, "consumed_samples": 257536, "global_step/max_steps": "1006/6362"} +{"lm loss": 5.21060133, "grad_norm": 1.00722408, "learning_rate": 9.692e-05, "elapsed_time_per_iteration": 6.33063173, "memory(GiB)": 21.51, "elapsed_time": "1h 50m 15s", "remaining_time": "9h 46m 20s", "loss_scale": 1.0, "consumed_samples": 257792, "global_step/max_steps": "1007/6362"} +{"lm loss": 5.20512486, "grad_norm": 1.10257983, "learning_rate": 9.691e-05, "elapsed_time_per_iteration": 6.45513821, "memory(GiB)": 21.51, "elapsed_time": "1h 50m 22s", "remaining_time": "9h 46m 13s", "loss_scale": 1.0, "consumed_samples": 258048, "global_step/max_steps": "1008/6362"} +{"lm loss": 5.20667315, "grad_norm": 0.97043133, "learning_rate": 9.691e-05, "elapsed_time_per_iteration": 6.52780962, "memory(GiB)": 21.51, "elapsed_time": "1h 50m 28s", "remaining_time": "9h 46m 6s", "loss_scale": 1.0, "consumed_samples": 258304, "global_step/max_steps": "1009/6362"} +{"lm loss": 5.20193815, "grad_norm": 1.03291261, "learning_rate": 9.69e-05, "elapsed_time_per_iteration": 6.41821861, "memory(GiB)": 21.51, "elapsed_time": "1h 50m 35s", "remaining_time": "9h 45m 59s", "loss_scale": 1.0, "consumed_samples": 258560, "global_step/max_steps": "1010/6362"} +{"lm loss": 5.19764185, "grad_norm": 1.22854269, "learning_rate": 9.689e-05, "elapsed_time_per_iteration": 6.28791738, "memory(GiB)": 21.51, "elapsed_time": "1h 50m 41s", "remaining_time": "9h 45m 51s", "loss_scale": 1.0, "consumed_samples": 258816, "global_step/max_steps": "1011/6362"} +{"lm loss": 5.20751429, "grad_norm": 1.06026626, "learning_rate": 9.688e-05, "elapsed_time_per_iteration": 6.6206944, "memory(GiB)": 21.51, "elapsed_time": "1h 50m 47s", "remaining_time": "9h 45m 44s", "loss_scale": 1.0, "consumed_samples": 259072, "global_step/max_steps": "1012/6362"} +{"lm loss": 5.20742702, "grad_norm": 0.99033064, "learning_rate": 9.687e-05, "elapsed_time_per_iteration": 6.89932632, "memory(GiB)": 21.51, "elapsed_time": "1h 50m 54s", "remaining_time": "9h 45m 40s", "loss_scale": 1.0, "consumed_samples": 259328, "global_step/max_steps": "1013/6362"} +{"lm loss": 5.19870138, "grad_norm": 0.9538101, "learning_rate": 9.686e-05, "elapsed_time_per_iteration": 6.64802814, "memory(GiB)": 21.51, "elapsed_time": "1h 51m 1s", "remaining_time": "9h 45m 33s", "loss_scale": 1.0, "consumed_samples": 259584, "global_step/max_steps": "1014/6362"} +{"lm loss": 5.19155455, "grad_norm": 0.96138096, "learning_rate": 9.685e-05, "elapsed_time_per_iteration": 6.36301136, "memory(GiB)": 21.51, "elapsed_time": "1h 51m 7s", "remaining_time": "9h 45m 26s", "loss_scale": 1.0, "consumed_samples": 259840, "global_step/max_steps": "1015/6362"} +{"lm loss": 5.18151522, "grad_norm": 0.95679855, "learning_rate": 9.684e-05, "elapsed_time_per_iteration": 6.45917344, "memory(GiB)": 21.51, "elapsed_time": "1h 51m 14s", "remaining_time": "9h 45m 19s", "loss_scale": 1.0, "consumed_samples": 260096, "global_step/max_steps": "1016/6362"} +{"lm loss": 5.17189598, "grad_norm": 1.17295504, "learning_rate": 9.683e-05, "elapsed_time_per_iteration": 6.51044297, "memory(GiB)": 21.51, "elapsed_time": "1h 51m 20s", "remaining_time": "9h 45m 12s", "loss_scale": 1.0, "consumed_samples": 260352, "global_step/max_steps": "1017/6362"} +{"lm loss": 5.17681026, "grad_norm": 0.91079408, "learning_rate": 9.683e-05, "elapsed_time_per_iteration": 6.52425885, "memory(GiB)": 21.51, "elapsed_time": "1h 51m 27s", "remaining_time": "9h 45m 5s", "loss_scale": 1.0, "consumed_samples": 260608, "global_step/max_steps": "1018/6362"} +{"lm loss": 5.16864157, "grad_norm": 0.96230727, "learning_rate": 9.682e-05, "elapsed_time_per_iteration": 6.5952208, "memory(GiB)": 21.51, "elapsed_time": "1h 51m 33s", "remaining_time": "9h 44m 58s", "loss_scale": 1.0, "consumed_samples": 260864, "global_step/max_steps": "1019/6362"} +{"lm loss": 5.18442774, "grad_norm": 1.0604502, "learning_rate": 9.681e-05, "elapsed_time_per_iteration": 6.44821763, "memory(GiB)": 21.51, "elapsed_time": "1h 51m 40s", "remaining_time": "9h 44m 51s", "loss_scale": 1.0, "consumed_samples": 261120, "global_step/max_steps": "1020/6362"} +{"lm loss": 5.18892241, "grad_norm": 1.13225353, "learning_rate": 9.68e-05, "elapsed_time_per_iteration": 6.82606745, "memory(GiB)": 21.51, "elapsed_time": "1h 51m 47s", "remaining_time": "9h 44m 46s", "loss_scale": 1.0, "consumed_samples": 261376, "global_step/max_steps": "1021/6362"} +{"lm loss": 5.19014502, "grad_norm": 0.97204328, "learning_rate": 9.679e-05, "elapsed_time_per_iteration": 6.85066676, "memory(GiB)": 21.51, "elapsed_time": "1h 51m 54s", "remaining_time": "9h 44m 41s", "loss_scale": 1.0, "consumed_samples": 261632, "global_step/max_steps": "1022/6362"} +{"lm loss": 5.17099762, "grad_norm": 1.00232649, "learning_rate": 9.678e-05, "elapsed_time_per_iteration": 6.5775969, "memory(GiB)": 21.51, "elapsed_time": "1h 52m 0s", "remaining_time": "9h 44m 34s", "loss_scale": 1.0, "consumed_samples": 261888, "global_step/max_steps": "1023/6362"} +{"lm loss": 5.2048254, "grad_norm": 1.07739246, "learning_rate": 9.677e-05, "elapsed_time_per_iteration": 6.46691251, "memory(GiB)": 21.51, "elapsed_time": "1h 52m 7s", "remaining_time": "9h 44m 27s", "loss_scale": 1.0, "consumed_samples": 262144, "global_step/max_steps": "1024/6362"} +{"lm loss": 5.18379021, "grad_norm": 0.99341214, "learning_rate": 9.676e-05, "elapsed_time_per_iteration": 6.65121222, "memory(GiB)": 21.51, "elapsed_time": "1h 52m 13s", "remaining_time": "9h 44m 21s", "loss_scale": 1.0, "consumed_samples": 262400, "global_step/max_steps": "1025/6362"} +{"lm loss": 5.17069435, "grad_norm": 0.98458445, "learning_rate": 9.675e-05, "elapsed_time_per_iteration": 6.62027049, "memory(GiB)": 21.51, "elapsed_time": "1h 52m 20s", "remaining_time": "9h 44m 15s", "loss_scale": 1.0, "consumed_samples": 262656, "global_step/max_steps": "1026/6362"} +{"lm loss": 5.19585323, "grad_norm": 1.01616001, "learning_rate": 9.674e-05, "elapsed_time_per_iteration": 6.73543501, "memory(GiB)": 21.51, "elapsed_time": "1h 52m 27s", "remaining_time": "9h 44m 9s", "loss_scale": 1.0, "consumed_samples": 262912, "global_step/max_steps": "1027/6362"} +{"lm loss": 5.17787933, "grad_norm": 1.10264456, "learning_rate": 9.674e-05, "elapsed_time_per_iteration": 6.64305711, "memory(GiB)": 21.51, "elapsed_time": "1h 52m 33s", "remaining_time": "9h 44m 3s", "loss_scale": 1.0, "consumed_samples": 263168, "global_step/max_steps": "1028/6362"} +{"lm loss": 5.20795536, "grad_norm": 0.88222122, "learning_rate": 9.673e-05, "elapsed_time_per_iteration": 6.7607286, "memory(GiB)": 21.51, "elapsed_time": "1h 52m 40s", "remaining_time": "9h 43m 57s", "loss_scale": 1.0, "consumed_samples": 263424, "global_step/max_steps": "1029/6362"} +{"lm loss": 5.20746565, "grad_norm": 1.04400456, "learning_rate": 9.672e-05, "elapsed_time_per_iteration": 6.5607307, "memory(GiB)": 21.51, "elapsed_time": "1h 52m 47s", "remaining_time": "9h 43m 51s", "loss_scale": 1.0, "consumed_samples": 263680, "global_step/max_steps": "1030/6362"} +{"lm loss": 5.1942935, "grad_norm": 1.1478852, "learning_rate": 9.671e-05, "elapsed_time_per_iteration": 6.59510279, "memory(GiB)": 21.51, "elapsed_time": "1h 52m 53s", "remaining_time": "9h 43m 44s", "loss_scale": 1.0, "consumed_samples": 263936, "global_step/max_steps": "1031/6362"} +{"lm loss": 5.18195391, "grad_norm": 0.91190279, "learning_rate": 9.67e-05, "elapsed_time_per_iteration": 6.54325771, "memory(GiB)": 21.51, "elapsed_time": "1h 53m 0s", "remaining_time": "9h 43m 38s", "loss_scale": 1.0, "consumed_samples": 264192, "global_step/max_steps": "1032/6362"} +{"lm loss": 5.17545557, "grad_norm": 1.12894964, "learning_rate": 9.669e-05, "elapsed_time_per_iteration": 6.644912, "memory(GiB)": 21.51, "elapsed_time": "1h 53m 6s", "remaining_time": "9h 43m 31s", "loss_scale": 1.0, "consumed_samples": 264448, "global_step/max_steps": "1033/6362"} +{"lm loss": 5.18064737, "grad_norm": 1.03944886, "learning_rate": 9.668e-05, "elapsed_time_per_iteration": 6.50987625, "memory(GiB)": 21.51, "elapsed_time": "1h 53m 13s", "remaining_time": "9h 43m 25s", "loss_scale": 1.0, "consumed_samples": 264704, "global_step/max_steps": "1034/6362"} +{"lm loss": 5.18248272, "grad_norm": 0.82880402, "learning_rate": 9.667e-05, "elapsed_time_per_iteration": 6.71524572, "memory(GiB)": 21.51, "elapsed_time": "1h 53m 20s", "remaining_time": "9h 43m 19s", "loss_scale": 1.0, "consumed_samples": 264960, "global_step/max_steps": "1035/6362"} +{"lm loss": 5.1991868, "grad_norm": 0.81278867, "learning_rate": 9.666e-05, "elapsed_time_per_iteration": 6.6346333, "memory(GiB)": 21.51, "elapsed_time": "1h 53m 26s", "remaining_time": "9h 43m 12s", "loss_scale": 1.0, "consumed_samples": 265216, "global_step/max_steps": "1036/6362"} +{"lm loss": 5.19329262, "grad_norm": 0.82327873, "learning_rate": 9.665e-05, "elapsed_time_per_iteration": 6.73102856, "memory(GiB)": 21.51, "elapsed_time": "1h 53m 33s", "remaining_time": "9h 43m 7s", "loss_scale": 1.0, "consumed_samples": 265472, "global_step/max_steps": "1037/6362"} +{"lm loss": 5.19059992, "grad_norm": 0.92343843, "learning_rate": 9.664e-05, "elapsed_time_per_iteration": 6.42845488, "memory(GiB)": 21.51, "elapsed_time": "1h 53m 39s", "remaining_time": "9h 42m 59s", "loss_scale": 1.0, "consumed_samples": 265728, "global_step/max_steps": "1038/6362"} +{"lm loss": 5.18588877, "grad_norm": 1.04815197, "learning_rate": 9.663e-05, "elapsed_time_per_iteration": 6.46869612, "memory(GiB)": 21.51, "elapsed_time": "1h 53m 46s", "remaining_time": "9h 42m 52s", "loss_scale": 1.0, "consumed_samples": 265984, "global_step/max_steps": "1039/6362"} +{"lm loss": 5.16845846, "grad_norm": 0.93400675, "learning_rate": 9.663e-05, "elapsed_time_per_iteration": 6.25245905, "memory(GiB)": 21.51, "elapsed_time": "1h 53m 52s", "remaining_time": "9h 42m 44s", "loss_scale": 1.0, "consumed_samples": 266240, "global_step/max_steps": "1040/6362"} +{"lm loss": 5.16707706, "grad_norm": 0.94794244, "learning_rate": 9.662e-05, "elapsed_time_per_iteration": 6.36532068, "memory(GiB)": 21.51, "elapsed_time": "1h 53m 58s", "remaining_time": "9h 42m 37s", "loss_scale": 1.0, "consumed_samples": 266496, "global_step/max_steps": "1041/6362"} +{"lm loss": 5.18723011, "grad_norm": 1.02845919, "learning_rate": 9.661e-05, "elapsed_time_per_iteration": 6.53104472, "memory(GiB)": 21.51, "elapsed_time": "1h 54m 5s", "remaining_time": "9h 42m 30s", "loss_scale": 1.0, "consumed_samples": 266752, "global_step/max_steps": "1042/6362"} +{"lm loss": 5.18178701, "grad_norm": 1.09756494, "learning_rate": 9.66e-05, "elapsed_time_per_iteration": 6.53693366, "memory(GiB)": 21.51, "elapsed_time": "1h 54m 12s", "remaining_time": "9h 42m 23s", "loss_scale": 1.0, "consumed_samples": 267008, "global_step/max_steps": "1043/6362"} +{"lm loss": 5.19342613, "grad_norm": 0.96095186, "learning_rate": 9.659e-05, "elapsed_time_per_iteration": 6.56137109, "memory(GiB)": 21.51, "elapsed_time": "1h 54m 18s", "remaining_time": "9h 42m 16s", "loss_scale": 1.0, "consumed_samples": 267264, "global_step/max_steps": "1044/6362"} +{"lm loss": 5.17854738, "grad_norm": 1.03542447, "learning_rate": 9.658e-05, "elapsed_time_per_iteration": 6.3083005, "memory(GiB)": 21.51, "elapsed_time": "1h 54m 24s", "remaining_time": "9h 42m 9s", "loss_scale": 1.0, "consumed_samples": 267520, "global_step/max_steps": "1045/6362"} +{"lm loss": 5.1765933, "grad_norm": 1.12208784, "learning_rate": 9.657e-05, "elapsed_time_per_iteration": 6.52037954, "memory(GiB)": 21.51, "elapsed_time": "1h 54m 31s", "remaining_time": "9h 42m 2s", "loss_scale": 1.0, "consumed_samples": 267776, "global_step/max_steps": "1046/6362"} +{"lm loss": 5.18161631, "grad_norm": 0.95245951, "learning_rate": 9.656e-05, "elapsed_time_per_iteration": 6.33841538, "memory(GiB)": 21.51, "elapsed_time": "1h 54m 37s", "remaining_time": "9h 41m 54s", "loss_scale": 1.0, "consumed_samples": 268032, "global_step/max_steps": "1047/6362"} +{"lm loss": 5.18274355, "grad_norm": 1.01095653, "learning_rate": 9.655e-05, "elapsed_time_per_iteration": 6.66713119, "memory(GiB)": 21.51, "elapsed_time": "1h 54m 44s", "remaining_time": "9h 41m 48s", "loss_scale": 1.0, "consumed_samples": 268288, "global_step/max_steps": "1048/6362"} +{"lm loss": 5.1593895, "grad_norm": 1.10523188, "learning_rate": 9.654e-05, "elapsed_time_per_iteration": 6.6739192, "memory(GiB)": 21.51, "elapsed_time": "1h 54m 51s", "remaining_time": "9h 41m 42s", "loss_scale": 1.0, "consumed_samples": 268544, "global_step/max_steps": "1049/6362"} +{"lm loss": 5.16153812, "grad_norm": 0.91680413, "learning_rate": 9.653e-05, "elapsed_time_per_iteration": 6.69511628, "memory(GiB)": 21.51, "elapsed_time": "1h 54m 57s", "remaining_time": "9h 41m 36s", "loss_scale": 1.0, "consumed_samples": 268800, "global_step/max_steps": "1050/6362"} +{"lm loss": 5.16992426, "grad_norm": 1.06112885, "learning_rate": 9.652e-05, "elapsed_time_per_iteration": 6.65709805, "memory(GiB)": 21.51, "elapsed_time": "1h 55m 4s", "remaining_time": "9h 41m 30s", "loss_scale": 1.0, "consumed_samples": 269056, "global_step/max_steps": "1051/6362"} +{"lm loss": 5.17057848, "grad_norm": 1.18445623, "learning_rate": 9.651e-05, "elapsed_time_per_iteration": 6.42081547, "memory(GiB)": 21.51, "elapsed_time": "1h 55m 10s", "remaining_time": "9h 41m 22s", "loss_scale": 1.0, "consumed_samples": 269312, "global_step/max_steps": "1052/6362"} +{"lm loss": 5.17381763, "grad_norm": 0.86122733, "learning_rate": 9.65e-05, "elapsed_time_per_iteration": 6.39079452, "memory(GiB)": 21.51, "elapsed_time": "1h 55m 17s", "remaining_time": "9h 41m 15s", "loss_scale": 1.0, "consumed_samples": 269568, "global_step/max_steps": "1053/6362"} +{"lm loss": 5.19846106, "grad_norm": 0.91587341, "learning_rate": 9.649e-05, "elapsed_time_per_iteration": 6.48772836, "memory(GiB)": 21.51, "elapsed_time": "1h 55m 23s", "remaining_time": "9h 41m 8s", "loss_scale": 1.0, "consumed_samples": 269824, "global_step/max_steps": "1054/6362"} +{"lm loss": 5.1937232, "grad_norm": 0.93587995, "learning_rate": 9.649e-05, "elapsed_time_per_iteration": 6.46972013, "memory(GiB)": 21.51, "elapsed_time": "1h 55m 30s", "remaining_time": "9h 41m 1s", "loss_scale": 1.0, "consumed_samples": 270080, "global_step/max_steps": "1055/6362"} +{"lm loss": 5.18740463, "grad_norm": 0.94111997, "learning_rate": 9.648e-05, "elapsed_time_per_iteration": 6.61070323, "memory(GiB)": 21.51, "elapsed_time": "1h 55m 36s", "remaining_time": "9h 40m 55s", "loss_scale": 1.0, "consumed_samples": 270336, "global_step/max_steps": "1056/6362"} +{"lm loss": 5.19946766, "grad_norm": 1.04119301, "learning_rate": 9.647e-05, "elapsed_time_per_iteration": 6.44347787, "memory(GiB)": 21.51, "elapsed_time": "1h 55m 43s", "remaining_time": "9h 40m 47s", "loss_scale": 1.0, "consumed_samples": 270592, "global_step/max_steps": "1057/6362"} +{"lm loss": 5.17306185, "grad_norm": 1.08284426, "learning_rate": 9.646e-05, "elapsed_time_per_iteration": 6.61694717, "memory(GiB)": 21.51, "elapsed_time": "1h 55m 49s", "remaining_time": "9h 40m 41s", "loss_scale": 1.0, "consumed_samples": 270848, "global_step/max_steps": "1058/6362"} +{"lm loss": 5.1771698, "grad_norm": 0.87472671, "learning_rate": 9.645e-05, "elapsed_time_per_iteration": 6.69863605, "memory(GiB)": 21.51, "elapsed_time": "1h 55m 56s", "remaining_time": "9h 40m 35s", "loss_scale": 1.0, "consumed_samples": 271104, "global_step/max_steps": "1059/6362"} +{"lm loss": 5.16900635, "grad_norm": 0.88772076, "learning_rate": 9.644e-05, "elapsed_time_per_iteration": 6.50791574, "memory(GiB)": 21.51, "elapsed_time": "1h 56m 3s", "remaining_time": "9h 40m 28s", "loss_scale": 1.0, "consumed_samples": 271360, "global_step/max_steps": "1060/6362"} +{"lm loss": 5.1840148, "grad_norm": 0.93509656, "learning_rate": 9.643e-05, "elapsed_time_per_iteration": 6.68326616, "memory(GiB)": 21.51, "elapsed_time": "1h 56m 9s", "remaining_time": "9h 40m 22s", "loss_scale": 1.0, "consumed_samples": 271616, "global_step/max_steps": "1061/6362"} +{"lm loss": 5.17490005, "grad_norm": 0.92550677, "learning_rate": 9.642e-05, "elapsed_time_per_iteration": 6.60022807, "memory(GiB)": 21.51, "elapsed_time": "1h 56m 16s", "remaining_time": "9h 40m 16s", "loss_scale": 1.0, "consumed_samples": 271872, "global_step/max_steps": "1062/6362"} +{"lm loss": 5.16008949, "grad_norm": 0.94712049, "learning_rate": 9.641e-05, "elapsed_time_per_iteration": 6.4698081, "memory(GiB)": 21.51, "elapsed_time": "1h 56m 22s", "remaining_time": "9h 40m 9s", "loss_scale": 1.0, "consumed_samples": 272128, "global_step/max_steps": "1063/6362"} +{"lm loss": 5.13962507, "grad_norm": 1.07837307, "learning_rate": 9.64e-05, "elapsed_time_per_iteration": 6.52119017, "memory(GiB)": 21.51, "elapsed_time": "1h 56m 29s", "remaining_time": "9h 40m 2s", "loss_scale": 1.0, "consumed_samples": 272384, "global_step/max_steps": "1064/6362"} +{"lm loss": 5.17146254, "grad_norm": 0.91773134, "learning_rate": 9.639e-05, "elapsed_time_per_iteration": 6.32047367, "memory(GiB)": 21.51, "elapsed_time": "1h 56m 35s", "remaining_time": "9h 39m 54s", "loss_scale": 1.0, "consumed_samples": 272640, "global_step/max_steps": "1065/6362"} +{"lm loss": 5.15677977, "grad_norm": 0.90356618, "learning_rate": 9.638e-05, "elapsed_time_per_iteration": 6.55511975, "memory(GiB)": 21.51, "elapsed_time": "1h 56m 42s", "remaining_time": "9h 39m 48s", "loss_scale": 1.0, "consumed_samples": 272896, "global_step/max_steps": "1066/6362"} +{"lm loss": 5.17020035, "grad_norm": 1.10277534, "learning_rate": 9.637e-05, "elapsed_time_per_iteration": 6.37512994, "memory(GiB)": 21.51, "elapsed_time": "1h 56m 48s", "remaining_time": "9h 39m 40s", "loss_scale": 1.0, "consumed_samples": 273152, "global_step/max_steps": "1067/6362"} +{"lm loss": 5.16859245, "grad_norm": 1.09185517, "learning_rate": 9.636e-05, "elapsed_time_per_iteration": 6.44439054, "memory(GiB)": 21.51, "elapsed_time": "1h 56m 55s", "remaining_time": "9h 39m 33s", "loss_scale": 1.0, "consumed_samples": 273408, "global_step/max_steps": "1068/6362"} +{"lm loss": 5.16287422, "grad_norm": 0.89998972, "learning_rate": 9.635e-05, "elapsed_time_per_iteration": 6.57825208, "memory(GiB)": 21.51, "elapsed_time": "1h 57m 1s", "remaining_time": "9h 39m 26s", "loss_scale": 1.0, "consumed_samples": 273664, "global_step/max_steps": "1069/6362"} +{"lm loss": 5.16365242, "grad_norm": 0.82666099, "learning_rate": 9.634e-05, "elapsed_time_per_iteration": 6.54324698, "memory(GiB)": 21.51, "elapsed_time": "1h 57m 8s", "remaining_time": "9h 39m 20s", "loss_scale": 1.0, "consumed_samples": 273920, "global_step/max_steps": "1070/6362"} +{"lm loss": 5.17165661, "grad_norm": 0.75369757, "learning_rate": 9.633e-05, "elapsed_time_per_iteration": 6.64875269, "memory(GiB)": 21.51, "elapsed_time": "1h 57m 14s", "remaining_time": "9h 39m 13s", "loss_scale": 1.0, "consumed_samples": 274176, "global_step/max_steps": "1071/6362"} +{"lm loss": 5.17704344, "grad_norm": 0.76282215, "learning_rate": 9.632e-05, "elapsed_time_per_iteration": 6.54026008, "memory(GiB)": 21.51, "elapsed_time": "1h 57m 21s", "remaining_time": "9h 39m 7s", "loss_scale": 1.0, "consumed_samples": 274432, "global_step/max_steps": "1072/6362"} +{"lm loss": 5.18136358, "grad_norm": 0.86124861, "learning_rate": 9.631e-05, "elapsed_time_per_iteration": 6.42259288, "memory(GiB)": 21.51, "elapsed_time": "1h 57m 27s", "remaining_time": "9h 38m 59s", "loss_scale": 1.0, "consumed_samples": 274688, "global_step/max_steps": "1073/6362"} +{"lm loss": 5.1544776, "grad_norm": 0.9614802, "learning_rate": 9.63e-05, "elapsed_time_per_iteration": 6.49896669, "memory(GiB)": 21.51, "elapsed_time": "1h 57m 34s", "remaining_time": "9h 38m 53s", "loss_scale": 1.0, "consumed_samples": 274944, "global_step/max_steps": "1074/6362"} +{"lm loss": 5.18040037, "grad_norm": 1.12997866, "learning_rate": 9.629e-05, "elapsed_time_per_iteration": 6.36873651, "memory(GiB)": 21.51, "elapsed_time": "1h 57m 40s", "remaining_time": "9h 38m 45s", "loss_scale": 1.0, "consumed_samples": 275200, "global_step/max_steps": "1075/6362"} +{"lm loss": 5.15152311, "grad_norm": 0.98441982, "learning_rate": 9.628e-05, "elapsed_time_per_iteration": 6.62463355, "memory(GiB)": 21.51, "elapsed_time": "1h 57m 47s", "remaining_time": "9h 38m 39s", "loss_scale": 1.0, "consumed_samples": 275456, "global_step/max_steps": "1076/6362"} +{"lm loss": 5.14974451, "grad_norm": 1.11030757, "learning_rate": 9.628e-05, "elapsed_time_per_iteration": 6.50352311, "memory(GiB)": 21.51, "elapsed_time": "1h 57m 53s", "remaining_time": "9h 38m 32s", "loss_scale": 1.0, "consumed_samples": 275712, "global_step/max_steps": "1077/6362"} +{"lm loss": 5.17014456, "grad_norm": 0.94340366, "learning_rate": 9.627e-05, "elapsed_time_per_iteration": 6.77382588, "memory(GiB)": 21.51, "elapsed_time": "1h 58m 0s", "remaining_time": "9h 38m 26s", "loss_scale": 1.0, "consumed_samples": 275968, "global_step/max_steps": "1078/6362"} +{"lm loss": 5.16642427, "grad_norm": 1.13108957, "learning_rate": 9.626e-05, "elapsed_time_per_iteration": 6.58098197, "memory(GiB)": 21.51, "elapsed_time": "1h 58m 7s", "remaining_time": "9h 38m 20s", "loss_scale": 1.0, "consumed_samples": 276224, "global_step/max_steps": "1079/6362"} +{"lm loss": 5.16736698, "grad_norm": 0.96640754, "learning_rate": 9.625e-05, "elapsed_time_per_iteration": 6.6067121, "memory(GiB)": 21.51, "elapsed_time": "1h 58m 13s", "remaining_time": "9h 38m 13s", "loss_scale": 1.0, "consumed_samples": 276480, "global_step/max_steps": "1080/6362"} +{"lm loss": 5.17998934, "grad_norm": 0.96465123, "learning_rate": 9.624e-05, "elapsed_time_per_iteration": 6.56496525, "memory(GiB)": 21.51, "elapsed_time": "1h 58m 20s", "remaining_time": "9h 38m 7s", "loss_scale": 1.0, "consumed_samples": 276736, "global_step/max_steps": "1081/6362"} +{"lm loss": 5.1785574, "grad_norm": 0.99102849, "learning_rate": 9.623e-05, "elapsed_time_per_iteration": 6.39472461, "memory(GiB)": 21.51, "elapsed_time": "1h 58m 26s", "remaining_time": "9h 37m 59s", "loss_scale": 1.0, "consumed_samples": 276992, "global_step/max_steps": "1082/6362"} +{"lm loss": 5.16709328, "grad_norm": 0.98920023, "learning_rate": 9.622e-05, "elapsed_time_per_iteration": 6.42864633, "memory(GiB)": 21.51, "elapsed_time": "1h 58m 33s", "remaining_time": "9h 37m 52s", "loss_scale": 1.0, "consumed_samples": 277248, "global_step/max_steps": "1083/6362"} +{"lm loss": 5.17171812, "grad_norm": 1.13674259, "learning_rate": 9.621e-05, "elapsed_time_per_iteration": 6.41425419, "memory(GiB)": 21.51, "elapsed_time": "1h 58m 39s", "remaining_time": "9h 37m 45s", "loss_scale": 1.0, "consumed_samples": 277504, "global_step/max_steps": "1084/6362"} +{"lm loss": 5.18997002, "grad_norm": 1.08588803, "learning_rate": 9.62e-05, "elapsed_time_per_iteration": 6.6652596, "memory(GiB)": 21.51, "elapsed_time": "1h 58m 46s", "remaining_time": "9h 37m 39s", "loss_scale": 1.0, "consumed_samples": 277760, "global_step/max_steps": "1085/6362"} +{"lm loss": 5.17106056, "grad_norm": 0.92585999, "learning_rate": 9.619e-05, "elapsed_time_per_iteration": 6.54802799, "memory(GiB)": 21.51, "elapsed_time": "1h 58m 52s", "remaining_time": "9h 37m 32s", "loss_scale": 1.0, "consumed_samples": 278016, "global_step/max_steps": "1086/6362"} +{"lm loss": 5.17975998, "grad_norm": 1.19285369, "learning_rate": 9.618e-05, "elapsed_time_per_iteration": 6.82804012, "memory(GiB)": 21.51, "elapsed_time": "1h 58m 59s", "remaining_time": "9h 37m 27s", "loss_scale": 1.0, "consumed_samples": 278272, "global_step/max_steps": "1087/6362"} +{"lm loss": 5.17943668, "grad_norm": 0.86648709, "learning_rate": 9.617e-05, "elapsed_time_per_iteration": 6.50620246, "memory(GiB)": 21.51, "elapsed_time": "1h 59m 6s", "remaining_time": "9h 37m 20s", "loss_scale": 1.0, "consumed_samples": 278528, "global_step/max_steps": "1088/6362"} +{"lm loss": 5.17043781, "grad_norm": 0.74808931, "learning_rate": 9.616e-05, "elapsed_time_per_iteration": 6.6625669, "memory(GiB)": 21.51, "elapsed_time": "1h 59m 12s", "remaining_time": "9h 37m 14s", "loss_scale": 1.0, "consumed_samples": 278784, "global_step/max_steps": "1089/6362"} +{"lm loss": 5.17575645, "grad_norm": 0.87852252, "learning_rate": 9.615e-05, "elapsed_time_per_iteration": 6.57796407, "memory(GiB)": 21.51, "elapsed_time": "1h 59m 19s", "remaining_time": "9h 37m 7s", "loss_scale": 1.0, "consumed_samples": 279040, "global_step/max_steps": "1090/6362"} +{"lm loss": 5.17975855, "grad_norm": 1.06465161, "learning_rate": 9.614e-05, "elapsed_time_per_iteration": 6.61507773, "memory(GiB)": 21.51, "elapsed_time": "1h 59m 25s", "remaining_time": "9h 37m 1s", "loss_scale": 1.0, "consumed_samples": 279296, "global_step/max_steps": "1091/6362"} +{"lm loss": 5.17050457, "grad_norm": 0.93577361, "learning_rate": 9.613e-05, "elapsed_time_per_iteration": 6.49495792, "memory(GiB)": 21.51, "elapsed_time": "1h 59m 32s", "remaining_time": "9h 36m 54s", "loss_scale": 1.0, "consumed_samples": 279552, "global_step/max_steps": "1092/6362"} +{"lm loss": 5.1743865, "grad_norm": 0.96464705, "learning_rate": 9.612e-05, "elapsed_time_per_iteration": 6.58257651, "memory(GiB)": 21.51, "elapsed_time": "1h 59m 39s", "remaining_time": "9h 36m 47s", "loss_scale": 1.0, "consumed_samples": 279808, "global_step/max_steps": "1093/6362"} +{"lm loss": 5.17078972, "grad_norm": 1.04467642, "learning_rate": 9.611e-05, "elapsed_time_per_iteration": 6.41371727, "memory(GiB)": 21.51, "elapsed_time": "1h 59m 45s", "remaining_time": "9h 36m 40s", "loss_scale": 1.0, "consumed_samples": 280064, "global_step/max_steps": "1094/6362"} +{"lm loss": 5.19288635, "grad_norm": 0.99346811, "learning_rate": 9.61e-05, "elapsed_time_per_iteration": 6.47221804, "memory(GiB)": 21.51, "elapsed_time": "1h 59m 51s", "remaining_time": "9h 36m 33s", "loss_scale": 1.0, "consumed_samples": 280320, "global_step/max_steps": "1095/6362"} +{"lm loss": 5.15594912, "grad_norm": 0.99128211, "learning_rate": 9.609e-05, "elapsed_time_per_iteration": 6.54706955, "memory(GiB)": 21.51, "elapsed_time": "1h 59m 58s", "remaining_time": "9h 36m 26s", "loss_scale": 1.0, "consumed_samples": 280576, "global_step/max_steps": "1096/6362"} +{"lm loss": 5.16336107, "grad_norm": 1.02338445, "learning_rate": 9.608e-05, "elapsed_time_per_iteration": 6.47421622, "memory(GiB)": 21.51, "elapsed_time": "2h 0m 4s", "remaining_time": "9h 36m 19s", "loss_scale": 1.0, "consumed_samples": 280832, "global_step/max_steps": "1097/6362"} +{"lm loss": 5.16498995, "grad_norm": 0.94721442, "learning_rate": 9.607e-05, "elapsed_time_per_iteration": 6.52620435, "memory(GiB)": 21.51, "elapsed_time": "2h 0m 11s", "remaining_time": "9h 36m 13s", "loss_scale": 1.0, "consumed_samples": 281088, "global_step/max_steps": "1098/6362"} +{"lm loss": 5.17269945, "grad_norm": 1.10388207, "learning_rate": 9.606e-05, "elapsed_time_per_iteration": 6.62417579, "memory(GiB)": 21.51, "elapsed_time": "2h 0m 18s", "remaining_time": "9h 36m 6s", "loss_scale": 1.0, "consumed_samples": 281344, "global_step/max_steps": "1099/6362"} +{"lm loss": 5.17574739, "grad_norm": 1.41144311, "learning_rate": 9.605e-05, "elapsed_time_per_iteration": 6.64016819, "memory(GiB)": 21.51, "elapsed_time": "2h 0m 24s", "remaining_time": "9h 36m 0s", "loss_scale": 1.0, "consumed_samples": 281600, "global_step/max_steps": "1100/6362"} +{"lm loss": 5.15622473, "grad_norm": 0.80152869, "learning_rate": 9.604e-05, "elapsed_time_per_iteration": 6.45319366, "memory(GiB)": 21.51, "elapsed_time": "2h 0m 31s", "remaining_time": "9h 35m 53s", "loss_scale": 1.0, "consumed_samples": 281856, "global_step/max_steps": "1101/6362"} +{"lm loss": 5.16715431, "grad_norm": 1.01688254, "learning_rate": 9.603e-05, "elapsed_time_per_iteration": 6.45775199, "memory(GiB)": 21.51, "elapsed_time": "2h 0m 37s", "remaining_time": "9h 35m 46s", "loss_scale": 1.0, "consumed_samples": 282112, "global_step/max_steps": "1102/6362"} +{"lm loss": 5.16128159, "grad_norm": 1.08461344, "learning_rate": 9.602e-05, "elapsed_time_per_iteration": 6.46851587, "memory(GiB)": 21.51, "elapsed_time": "2h 0m 44s", "remaining_time": "9h 35m 39s", "loss_scale": 1.0, "consumed_samples": 282368, "global_step/max_steps": "1103/6362"} +{"lm loss": 5.17902899, "grad_norm": 0.99714518, "learning_rate": 9.601e-05, "elapsed_time_per_iteration": 6.5332315, "memory(GiB)": 21.51, "elapsed_time": "2h 0m 50s", "remaining_time": "9h 35m 32s", "loss_scale": 1.0, "consumed_samples": 282624, "global_step/max_steps": "1104/6362"} +{"lm loss": 5.16298342, "grad_norm": 1.13209248, "learning_rate": 9.6e-05, "elapsed_time_per_iteration": 6.41655946, "memory(GiB)": 21.51, "elapsed_time": "2h 0m 57s", "remaining_time": "9h 35m 25s", "loss_scale": 1.0, "consumed_samples": 282880, "global_step/max_steps": "1105/6362"} +{"lm loss": 5.15875912, "grad_norm": 0.89333898, "learning_rate": 9.599e-05, "elapsed_time_per_iteration": 6.59641981, "memory(GiB)": 21.51, "elapsed_time": "2h 1m 3s", "remaining_time": "9h 35m 18s", "loss_scale": 1.0, "consumed_samples": 283136, "global_step/max_steps": "1106/6362"} +{"lm loss": 5.16897058, "grad_norm": 0.86387938, "learning_rate": 9.598e-05, "elapsed_time_per_iteration": 6.42111039, "memory(GiB)": 21.51, "elapsed_time": "2h 1m 10s", "remaining_time": "9h 35m 11s", "loss_scale": 1.0, "consumed_samples": 283392, "global_step/max_steps": "1107/6362"} +{"lm loss": 5.15631628, "grad_norm": 0.86714596, "learning_rate": 9.597e-05, "elapsed_time_per_iteration": 6.5872066, "memory(GiB)": 21.51, "elapsed_time": "2h 1m 16s", "remaining_time": "9h 35m 5s", "loss_scale": 1.0, "consumed_samples": 283648, "global_step/max_steps": "1108/6362"} +{"lm loss": 5.15753603, "grad_norm": 0.82774031, "learning_rate": 9.596e-05, "elapsed_time_per_iteration": 6.4656148, "memory(GiB)": 21.51, "elapsed_time": "2h 1m 23s", "remaining_time": "9h 34m 58s", "loss_scale": 1.0, "consumed_samples": 283904, "global_step/max_steps": "1109/6362"} +{"lm loss": 5.1547451, "grad_norm": 0.91588533, "learning_rate": 9.595e-05, "elapsed_time_per_iteration": 6.60485578, "memory(GiB)": 21.51, "elapsed_time": "2h 1m 29s", "remaining_time": "9h 34m 51s", "loss_scale": 1.0, "consumed_samples": 284160, "global_step/max_steps": "1110/6362"} +{"lm loss": 5.16489935, "grad_norm": 1.15282786, "learning_rate": 9.594e-05, "elapsed_time_per_iteration": 6.405123, "memory(GiB)": 21.51, "elapsed_time": "2h 1m 36s", "remaining_time": "9h 34m 44s", "loss_scale": 1.0, "consumed_samples": 284416, "global_step/max_steps": "1111/6362"} +{"lm loss": 5.17867517, "grad_norm": 1.01718485, "learning_rate": 9.593e-05, "elapsed_time_per_iteration": 6.78894758, "memory(GiB)": 21.51, "elapsed_time": "2h 1m 42s", "remaining_time": "9h 34m 38s", "loss_scale": 1.0, "consumed_samples": 284672, "global_step/max_steps": "1112/6362"} +{"lm loss": 5.18736839, "grad_norm": 0.94548863, "learning_rate": 9.592e-05, "elapsed_time_per_iteration": 6.84649801, "memory(GiB)": 21.51, "elapsed_time": "2h 1m 49s", "remaining_time": "9h 34m 33s", "loss_scale": 1.0, "consumed_samples": 284928, "global_step/max_steps": "1113/6362"} +{"lm loss": 5.18325043, "grad_norm": 0.98596531, "learning_rate": 9.591e-05, "elapsed_time_per_iteration": 6.47846174, "memory(GiB)": 21.51, "elapsed_time": "2h 1m 56s", "remaining_time": "9h 34m 26s", "loss_scale": 1.0, "consumed_samples": 285184, "global_step/max_steps": "1114/6362"} +{"lm loss": 5.16476011, "grad_norm": 0.97568715, "learning_rate": 9.59e-05, "elapsed_time_per_iteration": 6.3886745, "memory(GiB)": 21.51, "elapsed_time": "2h 2m 2s", "remaining_time": "9h 34m 19s", "loss_scale": 1.0, "consumed_samples": 285440, "global_step/max_steps": "1115/6362"} +{"lm loss": 5.15533161, "grad_norm": 1.06101692, "learning_rate": 9.589e-05, "elapsed_time_per_iteration": 6.41216016, "memory(GiB)": 21.51, "elapsed_time": "2h 2m 9s", "remaining_time": "9h 34m 12s", "loss_scale": 1.0, "consumed_samples": 285696, "global_step/max_steps": "1116/6362"} +{"lm loss": 5.15681744, "grad_norm": 1.07675087, "learning_rate": 9.588e-05, "elapsed_time_per_iteration": 6.39531779, "memory(GiB)": 21.51, "elapsed_time": "2h 2m 15s", "remaining_time": "9h 34m 4s", "loss_scale": 1.0, "consumed_samples": 285952, "global_step/max_steps": "1117/6362"} +{"lm loss": 5.15686131, "grad_norm": 0.93648881, "learning_rate": 9.587e-05, "elapsed_time_per_iteration": 6.56571627, "memory(GiB)": 21.51, "elapsed_time": "2h 2m 22s", "remaining_time": "9h 33m 58s", "loss_scale": 1.0, "consumed_samples": 286208, "global_step/max_steps": "1118/6362"} +{"lm loss": 5.1762743, "grad_norm": 0.83362228, "learning_rate": 9.586e-05, "elapsed_time_per_iteration": 6.55758286, "memory(GiB)": 21.51, "elapsed_time": "2h 2m 28s", "remaining_time": "9h 33m 51s", "loss_scale": 1.0, "consumed_samples": 286464, "global_step/max_steps": "1119/6362"} +{"lm loss": 5.17157125, "grad_norm": 0.81524098, "learning_rate": 9.585e-05, "elapsed_time_per_iteration": 6.53221178, "memory(GiB)": 21.51, "elapsed_time": "2h 2m 35s", "remaining_time": "9h 33m 44s", "loss_scale": 1.0, "consumed_samples": 286720, "global_step/max_steps": "1120/6362"} +{"lm loss": 5.1767025, "grad_norm": 0.81257874, "learning_rate": 9.584e-05, "elapsed_time_per_iteration": 6.52513862, "memory(GiB)": 21.51, "elapsed_time": "2h 2m 41s", "remaining_time": "9h 33m 37s", "loss_scale": 1.0, "consumed_samples": 286976, "global_step/max_steps": "1121/6362"} +{"lm loss": 5.18578959, "grad_norm": 0.85353774, "learning_rate": 9.583e-05, "elapsed_time_per_iteration": 6.57034659, "memory(GiB)": 21.51, "elapsed_time": "2h 2m 48s", "remaining_time": "9h 33m 31s", "loss_scale": 1.0, "consumed_samples": 287232, "global_step/max_steps": "1122/6362"} +{"lm loss": 5.1651845, "grad_norm": 0.80912966, "learning_rate": 9.582e-05, "elapsed_time_per_iteration": 6.40818381, "memory(GiB)": 21.51, "elapsed_time": "2h 2m 54s", "remaining_time": "9h 33m 24s", "loss_scale": 1.0, "consumed_samples": 287488, "global_step/max_steps": "1123/6362"} +{"lm loss": 5.17056227, "grad_norm": 0.74760616, "learning_rate": 9.581e-05, "elapsed_time_per_iteration": 6.50439548, "memory(GiB)": 21.51, "elapsed_time": "2h 3m 1s", "remaining_time": "9h 33m 17s", "loss_scale": 1.0, "consumed_samples": 287744, "global_step/max_steps": "1124/6362"} +{"lm loss": 5.16111708, "grad_norm": 0.88228667, "learning_rate": 9.58e-05, "elapsed_time_per_iteration": 6.98804212, "memory(GiB)": 21.51, "elapsed_time": "2h 3m 8s", "remaining_time": "9h 33m 12s", "loss_scale": 1.0, "consumed_samples": 288000, "global_step/max_steps": "1125/6362"} +{"lm loss": 5.1718874, "grad_norm": 0.95151669, "learning_rate": 9.579e-05, "elapsed_time_per_iteration": 6.74513626, "memory(GiB)": 21.51, "elapsed_time": "2h 3m 14s", "remaining_time": "9h 33m 6s", "loss_scale": 1.0, "consumed_samples": 288256, "global_step/max_steps": "1126/6362"} +{"lm loss": 5.13122511, "grad_norm": 0.95594078, "learning_rate": 9.578e-05, "elapsed_time_per_iteration": 6.78504014, "memory(GiB)": 21.51, "elapsed_time": "2h 3m 21s", "remaining_time": "9h 33m 1s", "loss_scale": 1.0, "consumed_samples": 288512, "global_step/max_steps": "1127/6362"} +{"lm loss": 5.14398384, "grad_norm": 1.10869038, "learning_rate": 9.577e-05, "elapsed_time_per_iteration": 6.57170796, "memory(GiB)": 21.51, "elapsed_time": "2h 3m 28s", "remaining_time": "9h 32m 54s", "loss_scale": 1.0, "consumed_samples": 288768, "global_step/max_steps": "1128/6362"} +{"lm loss": 5.18207598, "grad_norm": 1.02996576, "learning_rate": 9.576e-05, "elapsed_time_per_iteration": 6.67936349, "memory(GiB)": 21.51, "elapsed_time": "2h 3m 34s", "remaining_time": "9h 32m 48s", "loss_scale": 1.0, "consumed_samples": 289024, "global_step/max_steps": "1129/6362"} +{"lm loss": 5.1488905, "grad_norm": 0.93238616, "learning_rate": 9.574e-05, "elapsed_time_per_iteration": 6.74382734, "memory(GiB)": 21.51, "elapsed_time": "2h 3m 41s", "remaining_time": "9h 32m 42s", "loss_scale": 1.0, "consumed_samples": 289280, "global_step/max_steps": "1130/6362"} +{"lm loss": 5.16325092, "grad_norm": 0.8963179, "learning_rate": 9.573e-05, "elapsed_time_per_iteration": 6.60330439, "memory(GiB)": 21.51, "elapsed_time": "2h 3m 48s", "remaining_time": "9h 32m 36s", "loss_scale": 1.0, "consumed_samples": 289536, "global_step/max_steps": "1131/6362"} +{"lm loss": 5.14434338, "grad_norm": 0.97826332, "learning_rate": 9.572e-05, "elapsed_time_per_iteration": 6.7482636, "memory(GiB)": 21.51, "elapsed_time": "2h 3m 55s", "remaining_time": "9h 32m 30s", "loss_scale": 1.0, "consumed_samples": 289792, "global_step/max_steps": "1132/6362"} +{"lm loss": 5.16708088, "grad_norm": 1.22545028, "learning_rate": 9.571e-05, "elapsed_time_per_iteration": 6.85250282, "memory(GiB)": 21.51, "elapsed_time": "2h 4m 1s", "remaining_time": "9h 32m 25s", "loss_scale": 1.0, "consumed_samples": 290048, "global_step/max_steps": "1133/6362"} +{"lm loss": 5.14875937, "grad_norm": 0.96383744, "learning_rate": 9.57e-05, "elapsed_time_per_iteration": 6.74863291, "memory(GiB)": 21.51, "elapsed_time": "2h 4m 8s", "remaining_time": "9h 32m 19s", "loss_scale": 1.0, "consumed_samples": 290304, "global_step/max_steps": "1134/6362"} +{"lm loss": 5.15580702, "grad_norm": 0.89467126, "learning_rate": 9.569e-05, "elapsed_time_per_iteration": 6.63939834, "memory(GiB)": 21.51, "elapsed_time": "2h 4m 15s", "remaining_time": "9h 32m 13s", "loss_scale": 1.0, "consumed_samples": 290560, "global_step/max_steps": "1135/6362"} +{"lm loss": 5.15495205, "grad_norm": 0.93431962, "learning_rate": 9.568e-05, "elapsed_time_per_iteration": 6.81624079, "memory(GiB)": 21.51, "elapsed_time": "2h 4m 22s", "remaining_time": "9h 32m 8s", "loss_scale": 1.0, "consumed_samples": 290816, "global_step/max_steps": "1136/6362"} +{"lm loss": 5.18111515, "grad_norm": 0.95242608, "learning_rate": 9.567e-05, "elapsed_time_per_iteration": 6.52143955, "memory(GiB)": 21.51, "elapsed_time": "2h 4m 28s", "remaining_time": "9h 32m 1s", "loss_scale": 1.0, "consumed_samples": 291072, "global_step/max_steps": "1137/6362"} +{"lm loss": 5.15690613, "grad_norm": 0.95186728, "learning_rate": 9.566e-05, "elapsed_time_per_iteration": 6.7527523, "memory(GiB)": 21.51, "elapsed_time": "2h 4m 35s", "remaining_time": "9h 31m 55s", "loss_scale": 1.0, "consumed_samples": 291328, "global_step/max_steps": "1138/6362"} +{"lm loss": 5.1565876, "grad_norm": 0.96595889, "learning_rate": 9.565e-05, "elapsed_time_per_iteration": 6.57399321, "memory(GiB)": 21.51, "elapsed_time": "2h 4m 41s", "remaining_time": "9h 31m 49s", "loss_scale": 1.0, "consumed_samples": 291584, "global_step/max_steps": "1139/6362"} +{"lm loss": 5.15897179, "grad_norm": 0.8289538, "learning_rate": 9.564e-05, "elapsed_time_per_iteration": 6.69061923, "memory(GiB)": 21.51, "elapsed_time": "2h 4m 48s", "remaining_time": "9h 31m 43s", "loss_scale": 1.0, "consumed_samples": 291840, "global_step/max_steps": "1140/6362"} +{"lm loss": 5.16703224, "grad_norm": 0.74695605, "learning_rate": 9.563e-05, "elapsed_time_per_iteration": 6.72235155, "memory(GiB)": 21.51, "elapsed_time": "2h 4m 55s", "remaining_time": "9h 31m 37s", "loss_scale": 1.0, "consumed_samples": 292096, "global_step/max_steps": "1141/6362"} +{"lm loss": 5.16267061, "grad_norm": 1.06038523, "learning_rate": 9.562e-05, "elapsed_time_per_iteration": 6.65111995, "memory(GiB)": 21.51, "elapsed_time": "2h 5m 1s", "remaining_time": "9h 31m 31s", "loss_scale": 1.0, "consumed_samples": 292352, "global_step/max_steps": "1142/6362"} +{"lm loss": 5.16671324, "grad_norm": 1.00442171, "learning_rate": 9.561e-05, "elapsed_time_per_iteration": 6.80549479, "memory(GiB)": 21.51, "elapsed_time": "2h 5m 8s", "remaining_time": "9h 31m 25s", "loss_scale": 1.0, "consumed_samples": 292608, "global_step/max_steps": "1143/6362"} +{"lm loss": 5.14755821, "grad_norm": 1.02783477, "learning_rate": 9.56e-05, "elapsed_time_per_iteration": 6.74903417, "memory(GiB)": 21.51, "elapsed_time": "2h 5m 15s", "remaining_time": "9h 31m 19s", "loss_scale": 1.0, "consumed_samples": 292864, "global_step/max_steps": "1144/6362"} +{"lm loss": 5.17871857, "grad_norm": 1.18573856, "learning_rate": 9.559e-05, "elapsed_time_per_iteration": 6.42883801, "memory(GiB)": 21.51, "elapsed_time": "2h 5m 21s", "remaining_time": "9h 31m 12s", "loss_scale": 1.0, "consumed_samples": 293120, "global_step/max_steps": "1145/6362"} +{"lm loss": 5.15147734, "grad_norm": 0.87338811, "learning_rate": 9.558e-05, "elapsed_time_per_iteration": 6.3742609, "memory(GiB)": 21.51, "elapsed_time": "2h 5m 28s", "remaining_time": "9h 31m 5s", "loss_scale": 1.0, "consumed_samples": 293376, "global_step/max_steps": "1146/6362"} +{"lm loss": 5.18215656, "grad_norm": 0.88044608, "learning_rate": 9.557e-05, "elapsed_time_per_iteration": 6.74830985, "memory(GiB)": 21.51, "elapsed_time": "2h 5m 35s", "remaining_time": "9h 30m 59s", "loss_scale": 1.0, "consumed_samples": 293632, "global_step/max_steps": "1147/6362"} +{"lm loss": 5.15628433, "grad_norm": 0.9921357, "learning_rate": 9.556e-05, "elapsed_time_per_iteration": 6.52975368, "memory(GiB)": 21.51, "elapsed_time": "2h 5m 41s", "remaining_time": "9h 30m 52s", "loss_scale": 1.0, "consumed_samples": 293888, "global_step/max_steps": "1148/6362"} +{"lm loss": 5.16215086, "grad_norm": 1.14268136, "learning_rate": 9.555e-05, "elapsed_time_per_iteration": 6.63466501, "memory(GiB)": 21.51, "elapsed_time": "2h 5m 48s", "remaining_time": "9h 30m 46s", "loss_scale": 1.0, "consumed_samples": 294144, "global_step/max_steps": "1149/6362"} +{"lm loss": 5.16356802, "grad_norm": 1.10989332, "learning_rate": 9.554e-05, "elapsed_time_per_iteration": 6.82556295, "memory(GiB)": 21.51, "elapsed_time": "2h 5m 55s", "remaining_time": "9h 30m 40s", "loss_scale": 1.0, "consumed_samples": 294400, "global_step/max_steps": "1150/6362"} +{"lm loss": 5.15645266, "grad_norm": 0.95186371, "learning_rate": 9.553e-05, "elapsed_time_per_iteration": 6.58839035, "memory(GiB)": 21.51, "elapsed_time": "2h 6m 1s", "remaining_time": "9h 30m 34s", "loss_scale": 1.0, "consumed_samples": 294656, "global_step/max_steps": "1151/6362"} +{"lm loss": 5.16680384, "grad_norm": 1.20840502, "learning_rate": 9.551e-05, "elapsed_time_per_iteration": 6.78154826, "memory(GiB)": 21.51, "elapsed_time": "2h 6m 8s", "remaining_time": "9h 30m 28s", "loss_scale": 1.0, "consumed_samples": 294912, "global_step/max_steps": "1152/6362"} +{"lm loss": 5.15683413, "grad_norm": 0.93245679, "learning_rate": 9.55e-05, "elapsed_time_per_iteration": 6.59212041, "memory(GiB)": 21.51, "elapsed_time": "2h 6m 15s", "remaining_time": "9h 30m 22s", "loss_scale": 1.0, "consumed_samples": 295168, "global_step/max_steps": "1153/6362"} +{"lm loss": 5.15364122, "grad_norm": 0.86413217, "learning_rate": 9.549e-05, "elapsed_time_per_iteration": 6.5595293, "memory(GiB)": 21.51, "elapsed_time": "2h 6m 21s", "remaining_time": "9h 30m 15s", "loss_scale": 1.0, "consumed_samples": 295424, "global_step/max_steps": "1154/6362"} +{"lm loss": 5.16178799, "grad_norm": 0.92084593, "learning_rate": 9.548e-05, "elapsed_time_per_iteration": 6.4880693, "memory(GiB)": 21.51, "elapsed_time": "2h 6m 28s", "remaining_time": "9h 30m 8s", "loss_scale": 1.0, "consumed_samples": 295680, "global_step/max_steps": "1155/6362"} +{"lm loss": 5.15224457, "grad_norm": 0.96740663, "learning_rate": 9.547e-05, "elapsed_time_per_iteration": 6.632339, "memory(GiB)": 21.51, "elapsed_time": "2h 6m 34s", "remaining_time": "9h 30m 2s", "loss_scale": 1.0, "consumed_samples": 295936, "global_step/max_steps": "1156/6362"} +{"lm loss": 5.15324259, "grad_norm": 1.04484653, "learning_rate": 9.546e-05, "elapsed_time_per_iteration": 6.63447666, "memory(GiB)": 21.51, "elapsed_time": "2h 6m 41s", "remaining_time": "9h 29m 56s", "loss_scale": 1.0, "consumed_samples": 296192, "global_step/max_steps": "1157/6362"} +{"lm loss": 5.15780401, "grad_norm": 1.13761282, "learning_rate": 9.545e-05, "elapsed_time_per_iteration": 6.44399834, "memory(GiB)": 21.51, "elapsed_time": "2h 6m 47s", "remaining_time": "9h 29m 49s", "loss_scale": 1.0, "consumed_samples": 296448, "global_step/max_steps": "1158/6362"} +{"lm loss": 5.14951086, "grad_norm": 0.84276295, "learning_rate": 9.544e-05, "elapsed_time_per_iteration": 6.44664693, "memory(GiB)": 21.51, "elapsed_time": "2h 6m 54s", "remaining_time": "9h 29m 42s", "loss_scale": 1.0, "consumed_samples": 296704, "global_step/max_steps": "1159/6362"} +{"lm loss": 5.16853905, "grad_norm": 0.87891418, "learning_rate": 9.543e-05, "elapsed_time_per_iteration": 6.45884109, "memory(GiB)": 21.51, "elapsed_time": "2h 7m 0s", "remaining_time": "9h 29m 34s", "loss_scale": 1.0, "consumed_samples": 296960, "global_step/max_steps": "1160/6362"} +{"lm loss": 5.16188335, "grad_norm": 0.9290278, "learning_rate": 9.542e-05, "elapsed_time_per_iteration": 6.58698344, "memory(GiB)": 21.51, "elapsed_time": "2h 7m 7s", "remaining_time": "9h 29m 28s", "loss_scale": 1.0, "consumed_samples": 297216, "global_step/max_steps": "1161/6362"} +{"lm loss": 5.1437726, "grad_norm": 0.98619211, "learning_rate": 9.541e-05, "elapsed_time_per_iteration": 6.68965626, "memory(GiB)": 21.51, "elapsed_time": "2h 7m 13s", "remaining_time": "9h 29m 22s", "loss_scale": 1.0, "consumed_samples": 297472, "global_step/max_steps": "1162/6362"} +{"lm loss": 5.16648483, "grad_norm": 0.91524887, "learning_rate": 9.54e-05, "elapsed_time_per_iteration": 6.64396358, "memory(GiB)": 21.51, "elapsed_time": "2h 7m 20s", "remaining_time": "9h 29m 16s", "loss_scale": 1.0, "consumed_samples": 297728, "global_step/max_steps": "1163/6362"} +{"lm loss": 5.15228653, "grad_norm": 1.03332949, "learning_rate": 9.539e-05, "elapsed_time_per_iteration": 6.79231548, "memory(GiB)": 21.51, "elapsed_time": "2h 7m 27s", "remaining_time": "9h 29m 10s", "loss_scale": 1.0, "consumed_samples": 297984, "global_step/max_steps": "1164/6362"} +{"lm loss": 5.14531565, "grad_norm": 0.92709452, "learning_rate": 9.538e-05, "elapsed_time_per_iteration": 6.50419283, "memory(GiB)": 21.51, "elapsed_time": "2h 7m 33s", "remaining_time": "9h 29m 3s", "loss_scale": 1.0, "consumed_samples": 298240, "global_step/max_steps": "1165/6362"} +{"lm loss": 5.15296555, "grad_norm": 0.88689953, "learning_rate": 9.537e-05, "elapsed_time_per_iteration": 6.64577436, "memory(GiB)": 21.51, "elapsed_time": "2h 7m 40s", "remaining_time": "9h 28m 57s", "loss_scale": 1.0, "consumed_samples": 298496, "global_step/max_steps": "1166/6362"} +{"lm loss": 5.1542325, "grad_norm": 0.93133169, "learning_rate": 9.535e-05, "elapsed_time_per_iteration": 6.54732704, "memory(GiB)": 21.51, "elapsed_time": "2h 7m 47s", "remaining_time": "9h 28m 50s", "loss_scale": 1.0, "consumed_samples": 298752, "global_step/max_steps": "1167/6362"} +{"lm loss": 5.14714956, "grad_norm": 0.9643482, "learning_rate": 9.534e-05, "elapsed_time_per_iteration": 6.60978746, "memory(GiB)": 21.51, "elapsed_time": "2h 7m 53s", "remaining_time": "9h 28m 44s", "loss_scale": 1.0, "consumed_samples": 299008, "global_step/max_steps": "1168/6362"} +{"lm loss": 5.16022921, "grad_norm": 0.99335974, "learning_rate": 9.533e-05, "elapsed_time_per_iteration": 6.43562913, "memory(GiB)": 21.51, "elapsed_time": "2h 8m 0s", "remaining_time": "9h 28m 37s", "loss_scale": 1.0, "consumed_samples": 299264, "global_step/max_steps": "1169/6362"} +{"lm loss": 5.13568878, "grad_norm": 1.00089133, "learning_rate": 9.532e-05, "elapsed_time_per_iteration": 6.69983006, "memory(GiB)": 21.51, "elapsed_time": "2h 8m 6s", "remaining_time": "9h 28m 31s", "loss_scale": 1.0, "consumed_samples": 299520, "global_step/max_steps": "1170/6362"} +{"lm loss": 5.16387033, "grad_norm": 1.03568172, "learning_rate": 9.531e-05, "elapsed_time_per_iteration": 6.47407365, "memory(GiB)": 21.51, "elapsed_time": "2h 8m 13s", "remaining_time": "9h 28m 24s", "loss_scale": 1.0, "consumed_samples": 299776, "global_step/max_steps": "1171/6362"} +{"lm loss": 5.16310501, "grad_norm": 1.01062548, "learning_rate": 9.53e-05, "elapsed_time_per_iteration": 6.65781093, "memory(GiB)": 21.51, "elapsed_time": "2h 8m 19s", "remaining_time": "9h 28m 18s", "loss_scale": 1.0, "consumed_samples": 300032, "global_step/max_steps": "1172/6362"} +{"lm loss": 5.1739006, "grad_norm": 1.01305819, "learning_rate": 9.529e-05, "elapsed_time_per_iteration": 6.53059196, "memory(GiB)": 21.51, "elapsed_time": "2h 8m 26s", "remaining_time": "9h 28m 11s", "loss_scale": 1.0, "consumed_samples": 300288, "global_step/max_steps": "1173/6362"} +{"lm loss": 5.1653223, "grad_norm": 0.99656993, "learning_rate": 9.528e-05, "elapsed_time_per_iteration": 6.78338623, "memory(GiB)": 21.51, "elapsed_time": "2h 8m 33s", "remaining_time": "9h 28m 5s", "loss_scale": 1.0, "consumed_samples": 300544, "global_step/max_steps": "1174/6362"} +{"lm loss": 5.12037754, "grad_norm": 1.03784847, "learning_rate": 9.527e-05, "elapsed_time_per_iteration": 6.48615408, "memory(GiB)": 21.51, "elapsed_time": "2h 8m 39s", "remaining_time": "9h 27m 58s", "loss_scale": 1.0, "consumed_samples": 300800, "global_step/max_steps": "1175/6362"} +{"lm loss": 5.15277243, "grad_norm": 0.90272707, "learning_rate": 9.526e-05, "elapsed_time_per_iteration": 6.79356027, "memory(GiB)": 21.51, "elapsed_time": "2h 8m 46s", "remaining_time": "9h 27m 53s", "loss_scale": 1.0, "consumed_samples": 301056, "global_step/max_steps": "1176/6362"} +{"lm loss": 5.14996004, "grad_norm": 0.90756977, "learning_rate": 9.525e-05, "elapsed_time_per_iteration": 6.52941561, "memory(GiB)": 21.51, "elapsed_time": "2h 8m 53s", "remaining_time": "9h 27m 46s", "loss_scale": 1.0, "consumed_samples": 301312, "global_step/max_steps": "1177/6362"} +{"lm loss": 5.1652317, "grad_norm": 0.90199298, "learning_rate": 9.524e-05, "elapsed_time_per_iteration": 6.53552055, "memory(GiB)": 21.51, "elapsed_time": "2h 8m 59s", "remaining_time": "9h 27m 39s", "loss_scale": 1.0, "consumed_samples": 301568, "global_step/max_steps": "1178/6362"} +{"lm loss": 5.12541723, "grad_norm": 0.89378417, "learning_rate": 9.522e-05, "elapsed_time_per_iteration": 6.53913331, "memory(GiB)": 21.51, "elapsed_time": "2h 9m 6s", "remaining_time": "9h 27m 33s", "loss_scale": 1.0, "consumed_samples": 301824, "global_step/max_steps": "1179/6362"} +{"lm loss": 5.15161276, "grad_norm": 0.98239017, "learning_rate": 9.521e-05, "elapsed_time_per_iteration": 6.54931617, "memory(GiB)": 21.51, "elapsed_time": "2h 9m 12s", "remaining_time": "9h 27m 26s", "loss_scale": 1.0, "consumed_samples": 302080, "global_step/max_steps": "1180/6362"} +{"lm loss": 5.15413952, "grad_norm": 0.94034934, "learning_rate": 9.52e-05, "elapsed_time_per_iteration": 6.56522226, "memory(GiB)": 21.51, "elapsed_time": "2h 9m 19s", "remaining_time": "9h 27m 19s", "loss_scale": 1.0, "consumed_samples": 302336, "global_step/max_steps": "1181/6362"} +{"lm loss": 5.1591711, "grad_norm": 0.82471251, "learning_rate": 9.519e-05, "elapsed_time_per_iteration": 6.52133679, "memory(GiB)": 21.51, "elapsed_time": "2h 9m 25s", "remaining_time": "9h 27m 13s", "loss_scale": 1.0, "consumed_samples": 302592, "global_step/max_steps": "1182/6362"} +{"lm loss": 5.14711857, "grad_norm": 0.81366456, "learning_rate": 9.518e-05, "elapsed_time_per_iteration": 6.47112393, "memory(GiB)": 21.51, "elapsed_time": "2h 9m 32s", "remaining_time": "9h 27m 5s", "loss_scale": 1.0, "consumed_samples": 302848, "global_step/max_steps": "1183/6362"} +{"lm loss": 5.15997934, "grad_norm": 1.0251838, "learning_rate": 9.517e-05, "elapsed_time_per_iteration": 6.50077367, "memory(GiB)": 21.51, "elapsed_time": "2h 9m 38s", "remaining_time": "9h 26m 59s", "loss_scale": 1.0, "consumed_samples": 303104, "global_step/max_steps": "1184/6362"} +{"lm loss": 5.15508461, "grad_norm": 1.21801817, "learning_rate": 9.516e-05, "elapsed_time_per_iteration": 6.48408079, "memory(GiB)": 21.51, "elapsed_time": "2h 9m 45s", "remaining_time": "9h 26m 52s", "loss_scale": 1.0, "consumed_samples": 303360, "global_step/max_steps": "1185/6362"} +{"lm loss": 5.13777065, "grad_norm": 0.80524397, "learning_rate": 9.515e-05, "elapsed_time_per_iteration": 6.62173033, "memory(GiB)": 21.51, "elapsed_time": "2h 9m 51s", "remaining_time": "9h 26m 45s", "loss_scale": 1.0, "consumed_samples": 303616, "global_step/max_steps": "1186/6362"} +{"lm loss": 5.14741182, "grad_norm": 0.76144338, "learning_rate": 9.514e-05, "elapsed_time_per_iteration": 6.50857639, "memory(GiB)": 21.51, "elapsed_time": "2h 9m 58s", "remaining_time": "9h 26m 38s", "loss_scale": 1.0, "consumed_samples": 303872, "global_step/max_steps": "1187/6362"} +{"lm loss": 5.13172579, "grad_norm": 0.8308388, "learning_rate": 9.513e-05, "elapsed_time_per_iteration": 6.50722432, "memory(GiB)": 21.51, "elapsed_time": "2h 10m 4s", "remaining_time": "9h 26m 32s", "loss_scale": 1.0, "consumed_samples": 304128, "global_step/max_steps": "1188/6362"} +{"lm loss": 5.11864519, "grad_norm": 0.9317975, "learning_rate": 9.511e-05, "elapsed_time_per_iteration": 6.56782746, "memory(GiB)": 21.51, "elapsed_time": "2h 10m 11s", "remaining_time": "9h 26m 25s", "loss_scale": 1.0, "consumed_samples": 304384, "global_step/max_steps": "1189/6362"} +{"lm loss": 5.14667988, "grad_norm": 0.95674258, "learning_rate": 9.51e-05, "elapsed_time_per_iteration": 6.41599512, "memory(GiB)": 21.51, "elapsed_time": "2h 10m 17s", "remaining_time": "9h 26m 18s", "loss_scale": 1.0, "consumed_samples": 304640, "global_step/max_steps": "1190/6362"} +{"lm loss": 5.14187574, "grad_norm": 0.93473017, "learning_rate": 9.509e-05, "elapsed_time_per_iteration": 6.42321277, "memory(GiB)": 21.51, "elapsed_time": "2h 10m 24s", "remaining_time": "9h 26m 11s", "loss_scale": 1.0, "consumed_samples": 304896, "global_step/max_steps": "1191/6362"} +{"lm loss": 5.14681673, "grad_norm": 0.80646408, "learning_rate": 9.508e-05, "elapsed_time_per_iteration": 6.86619878, "memory(GiB)": 21.51, "elapsed_time": "2h 10m 31s", "remaining_time": "9h 26m 5s", "loss_scale": 1.0, "consumed_samples": 305152, "global_step/max_steps": "1192/6362"} +{"lm loss": 5.16967201, "grad_norm": 0.83154374, "learning_rate": 9.507e-05, "elapsed_time_per_iteration": 6.59017205, "memory(GiB)": 21.51, "elapsed_time": "2h 10m 37s", "remaining_time": "9h 25m 59s", "loss_scale": 1.0, "consumed_samples": 305408, "global_step/max_steps": "1193/6362"} +{"lm loss": 5.14146328, "grad_norm": 0.88046783, "learning_rate": 9.506e-05, "elapsed_time_per_iteration": 6.56841087, "memory(GiB)": 21.51, "elapsed_time": "2h 10m 44s", "remaining_time": "9h 25m 52s", "loss_scale": 1.0, "consumed_samples": 305664, "global_step/max_steps": "1194/6362"} +{"lm loss": 5.12819386, "grad_norm": 1.03178728, "learning_rate": 9.505e-05, "elapsed_time_per_iteration": 6.6865654, "memory(GiB)": 21.51, "elapsed_time": "2h 10m 51s", "remaining_time": "9h 25m 46s", "loss_scale": 1.0, "consumed_samples": 305920, "global_step/max_steps": "1195/6362"} +{"lm loss": 5.15671921, "grad_norm": 1.07133257, "learning_rate": 9.504e-05, "elapsed_time_per_iteration": 6.56153107, "memory(GiB)": 21.51, "elapsed_time": "2h 10m 57s", "remaining_time": "9h 25m 40s", "loss_scale": 1.0, "consumed_samples": 306176, "global_step/max_steps": "1196/6362"} +{"lm loss": 5.17170715, "grad_norm": 0.9863534, "learning_rate": 9.503e-05, "elapsed_time_per_iteration": 6.41429973, "memory(GiB)": 21.51, "elapsed_time": "2h 11m 4s", "remaining_time": "9h 25m 32s", "loss_scale": 1.0, "consumed_samples": 306432, "global_step/max_steps": "1197/6362"} +{"lm loss": 5.15919971, "grad_norm": 1.05966008, "learning_rate": 9.502e-05, "elapsed_time_per_iteration": 6.79501677, "memory(GiB)": 21.51, "elapsed_time": "2h 11m 10s", "remaining_time": "9h 25m 27s", "loss_scale": 1.0, "consumed_samples": 306688, "global_step/max_steps": "1198/6362"} +{"lm loss": 5.15653038, "grad_norm": 1.02995026, "learning_rate": 9.5e-05, "elapsed_time_per_iteration": 6.41200447, "memory(GiB)": 21.51, "elapsed_time": "2h 11m 17s", "remaining_time": "9h 25m 20s", "loss_scale": 1.0, "consumed_samples": 306944, "global_step/max_steps": "1199/6362"} +{"lm loss": 5.13966894, "grad_norm": 0.87788689, "learning_rate": 9.499e-05, "elapsed_time_per_iteration": 6.506145, "memory(GiB)": 21.51, "elapsed_time": "2h 11m 23s", "remaining_time": "9h 25m 13s", "loss_scale": 1.0, "consumed_samples": 307200, "global_step/max_steps": "1200/6362"} +{"lm loss": 5.13075829, "grad_norm": 0.87846762, "learning_rate": 9.498e-05, "elapsed_time_per_iteration": 6.682446, "memory(GiB)": 21.51, "elapsed_time": "2h 11m 30s", "remaining_time": "9h 25m 7s", "loss_scale": 1.0, "consumed_samples": 307456, "global_step/max_steps": "1201/6362"} +{"lm loss": 5.1428442, "grad_norm": 0.92220551, "learning_rate": 9.497e-05, "elapsed_time_per_iteration": 6.51808977, "memory(GiB)": 21.51, "elapsed_time": "2h 11m 36s", "remaining_time": "9h 25m 0s", "loss_scale": 1.0, "consumed_samples": 307712, "global_step/max_steps": "1202/6362"} +{"lm loss": 5.14146757, "grad_norm": 0.84091437, "learning_rate": 9.496e-05, "elapsed_time_per_iteration": 6.54684091, "memory(GiB)": 21.51, "elapsed_time": "2h 11m 43s", "remaining_time": "9h 24m 53s", "loss_scale": 1.0, "consumed_samples": 307968, "global_step/max_steps": "1203/6362"} +{"lm loss": 5.13430977, "grad_norm": 0.91461462, "learning_rate": 9.495e-05, "elapsed_time_per_iteration": 6.80108523, "memory(GiB)": 21.51, "elapsed_time": "2h 11m 50s", "remaining_time": "9h 24m 48s", "loss_scale": 1.0, "consumed_samples": 308224, "global_step/max_steps": "1204/6362"} +{"lm loss": 5.14757299, "grad_norm": 1.03568411, "learning_rate": 9.494e-05, "elapsed_time_per_iteration": 6.78740835, "memory(GiB)": 21.51, "elapsed_time": "2h 11m 57s", "remaining_time": "9h 24m 42s", "loss_scale": 1.0, "consumed_samples": 308480, "global_step/max_steps": "1205/6362"} +{"lm loss": 5.13945913, "grad_norm": 0.9921242, "learning_rate": 9.493e-05, "elapsed_time_per_iteration": 6.84121561, "memory(GiB)": 21.51, "elapsed_time": "2h 12m 3s", "remaining_time": "9h 24m 37s", "loss_scale": 1.0, "consumed_samples": 308736, "global_step/max_steps": "1206/6362"} +{"lm loss": 5.13248158, "grad_norm": 0.95936096, "learning_rate": 9.491e-05, "elapsed_time_per_iteration": 6.89293051, "memory(GiB)": 21.51, "elapsed_time": "2h 12m 10s", "remaining_time": "9h 24m 31s", "loss_scale": 1.0, "consumed_samples": 308992, "global_step/max_steps": "1207/6362"} +{"lm loss": 5.13118124, "grad_norm": 0.83806974, "learning_rate": 9.49e-05, "elapsed_time_per_iteration": 6.72821403, "memory(GiB)": 21.51, "elapsed_time": "2h 12m 17s", "remaining_time": "9h 24m 25s", "loss_scale": 1.0, "consumed_samples": 309248, "global_step/max_steps": "1208/6362"} +{"lm loss": 5.12951326, "grad_norm": 0.8491478, "learning_rate": 9.489e-05, "elapsed_time_per_iteration": 6.7360003, "memory(GiB)": 21.51, "elapsed_time": "2h 12m 24s", "remaining_time": "9h 24m 20s", "loss_scale": 1.0, "consumed_samples": 309504, "global_step/max_steps": "1209/6362"} +{"lm loss": 5.12802982, "grad_norm": 0.94290102, "learning_rate": 9.488e-05, "elapsed_time_per_iteration": 6.55394554, "memory(GiB)": 21.51, "elapsed_time": "2h 12m 30s", "remaining_time": "9h 24m 13s", "loss_scale": 1.0, "consumed_samples": 309760, "global_step/max_steps": "1210/6362"} +{"lm loss": 5.14449263, "grad_norm": 1.09029317, "learning_rate": 9.487e-05, "elapsed_time_per_iteration": 6.71413088, "memory(GiB)": 21.51, "elapsed_time": "2h 12m 37s", "remaining_time": "9h 24m 7s", "loss_scale": 1.0, "consumed_samples": 310016, "global_step/max_steps": "1211/6362"} +{"lm loss": 5.14394712, "grad_norm": 1.02533424, "learning_rate": 9.486e-05, "elapsed_time_per_iteration": 6.83811188, "memory(GiB)": 21.51, "elapsed_time": "2h 12m 44s", "remaining_time": "9h 24m 2s", "loss_scale": 1.0, "consumed_samples": 310272, "global_step/max_steps": "1212/6362"} +{"lm loss": 5.12732792, "grad_norm": 1.02628171, "learning_rate": 9.485e-05, "elapsed_time_per_iteration": 6.66201997, "memory(GiB)": 21.51, "elapsed_time": "2h 12m 51s", "remaining_time": "9h 23m 55s", "loss_scale": 1.0, "consumed_samples": 310528, "global_step/max_steps": "1213/6362"} +{"lm loss": 5.13366604, "grad_norm": 1.014938, "learning_rate": 9.484e-05, "elapsed_time_per_iteration": 6.8222878, "memory(GiB)": 21.51, "elapsed_time": "2h 12m 57s", "remaining_time": "9h 23m 50s", "loss_scale": 1.0, "consumed_samples": 310784, "global_step/max_steps": "1214/6362"} +{"lm loss": 5.12389231, "grad_norm": 1.00931859, "learning_rate": 9.482e-05, "elapsed_time_per_iteration": 6.5197401, "memory(GiB)": 21.51, "elapsed_time": "2h 13m 4s", "remaining_time": "9h 23m 43s", "loss_scale": 1.0, "consumed_samples": 311040, "global_step/max_steps": "1215/6362"} +{"lm loss": 5.14205122, "grad_norm": 0.88409716, "learning_rate": 9.481e-05, "elapsed_time_per_iteration": 6.47736955, "memory(GiB)": 21.51, "elapsed_time": "2h 13m 10s", "remaining_time": "9h 23m 36s", "loss_scale": 1.0, "consumed_samples": 311296, "global_step/max_steps": "1216/6362"} +{"lm loss": 5.14833307, "grad_norm": 0.7848174, "learning_rate": 9.48e-05, "elapsed_time_per_iteration": 6.47017455, "memory(GiB)": 21.51, "elapsed_time": "2h 13m 17s", "remaining_time": "9h 23m 29s", "loss_scale": 1.0, "consumed_samples": 311552, "global_step/max_steps": "1217/6362"} +{"lm loss": 5.15203094, "grad_norm": 0.94721764, "learning_rate": 9.479e-05, "elapsed_time_per_iteration": 6.41192985, "memory(GiB)": 21.51, "elapsed_time": "2h 13m 23s", "remaining_time": "9h 23m 22s", "loss_scale": 1.0, "consumed_samples": 311808, "global_step/max_steps": "1218/6362"} +{"lm loss": 5.13864851, "grad_norm": 1.01819134, "learning_rate": 9.478e-05, "elapsed_time_per_iteration": 6.70355105, "memory(GiB)": 21.51, "elapsed_time": "2h 13m 30s", "remaining_time": "9h 23m 16s", "loss_scale": 1.0, "consumed_samples": 312064, "global_step/max_steps": "1219/6362"} +{"lm loss": 5.14579821, "grad_norm": 0.96339291, "learning_rate": 9.477e-05, "elapsed_time_per_iteration": 6.42977667, "memory(GiB)": 21.51, "elapsed_time": "2h 13m 36s", "remaining_time": "9h 23m 9s", "loss_scale": 1.0, "consumed_samples": 312320, "global_step/max_steps": "1220/6362"} +{"lm loss": 5.13308001, "grad_norm": 0.79872882, "learning_rate": 9.476e-05, "elapsed_time_per_iteration": 6.57368898, "memory(GiB)": 21.51, "elapsed_time": "2h 13m 43s", "remaining_time": "9h 23m 2s", "loss_scale": 1.0, "consumed_samples": 312576, "global_step/max_steps": "1221/6362"} +{"lm loss": 5.13537073, "grad_norm": 0.93036449, "learning_rate": 9.474e-05, "elapsed_time_per_iteration": 6.71899867, "memory(GiB)": 21.51, "elapsed_time": "2h 13m 50s", "remaining_time": "9h 22m 56s", "loss_scale": 1.0, "consumed_samples": 312832, "global_step/max_steps": "1222/6362"} +{"lm loss": 5.14025021, "grad_norm": 0.95158124, "learning_rate": 9.473e-05, "elapsed_time_per_iteration": 6.39408565, "memory(GiB)": 21.51, "elapsed_time": "2h 13m 56s", "remaining_time": "9h 22m 49s", "loss_scale": 1.0, "consumed_samples": 313088, "global_step/max_steps": "1223/6362"} +{"lm loss": 5.12573433, "grad_norm": 0.79483068, "learning_rate": 9.472e-05, "elapsed_time_per_iteration": 6.43434334, "memory(GiB)": 21.51, "elapsed_time": "2h 14m 2s", "remaining_time": "9h 22m 42s", "loss_scale": 1.0, "consumed_samples": 313344, "global_step/max_steps": "1224/6362"} +{"lm loss": 5.12815714, "grad_norm": 0.72479457, "learning_rate": 9.471e-05, "elapsed_time_per_iteration": 6.58817816, "memory(GiB)": 21.51, "elapsed_time": "2h 14m 9s", "remaining_time": "9h 22m 35s", "loss_scale": 1.0, "consumed_samples": 313600, "global_step/max_steps": "1225/6362"} +{"lm loss": 5.13574076, "grad_norm": 0.79845417, "learning_rate": 9.47e-05, "elapsed_time_per_iteration": 6.51298165, "memory(GiB)": 21.51, "elapsed_time": "2h 14m 16s", "remaining_time": "9h 22m 28s", "loss_scale": 1.0, "consumed_samples": 313856, "global_step/max_steps": "1226/6362"} +{"lm loss": 5.12657261, "grad_norm": 0.955589, "learning_rate": 9.469e-05, "elapsed_time_per_iteration": 6.70494246, "memory(GiB)": 21.51, "elapsed_time": "2h 14m 22s", "remaining_time": "9h 22m 22s", "loss_scale": 1.0, "consumed_samples": 314112, "global_step/max_steps": "1227/6362"} +{"lm loss": 5.1229744, "grad_norm": 0.93076676, "learning_rate": 9.468e-05, "elapsed_time_per_iteration": 6.63185859, "memory(GiB)": 21.51, "elapsed_time": "2h 14m 29s", "remaining_time": "9h 22m 16s", "loss_scale": 1.0, "consumed_samples": 314368, "global_step/max_steps": "1228/6362"} +{"lm loss": 5.1539526, "grad_norm": 1.0021044, "learning_rate": 9.466e-05, "elapsed_time_per_iteration": 6.84669113, "memory(GiB)": 21.51, "elapsed_time": "2h 14m 36s", "remaining_time": "9h 22m 11s", "loss_scale": 1.0, "consumed_samples": 314624, "global_step/max_steps": "1229/6362"} +{"lm loss": 5.14308071, "grad_norm": 1.19123459, "learning_rate": 9.465e-05, "elapsed_time_per_iteration": 6.57390594, "memory(GiB)": 21.51, "elapsed_time": "2h 14m 42s", "remaining_time": "9h 22m 4s", "loss_scale": 1.0, "consumed_samples": 314880, "global_step/max_steps": "1230/6362"} +{"lm loss": 5.14805841, "grad_norm": 0.7963171, "learning_rate": 9.464e-05, "elapsed_time_per_iteration": 6.64103103, "memory(GiB)": 21.51, "elapsed_time": "2h 14m 49s", "remaining_time": "9h 21m 58s", "loss_scale": 1.0, "consumed_samples": 315136, "global_step/max_steps": "1231/6362"} +{"lm loss": 5.146101, "grad_norm": 0.80870867, "learning_rate": 9.463e-05, "elapsed_time_per_iteration": 6.79548407, "memory(GiB)": 21.51, "elapsed_time": "2h 14m 56s", "remaining_time": "9h 21m 52s", "loss_scale": 1.0, "consumed_samples": 315392, "global_step/max_steps": "1232/6362"} +{"lm loss": 5.13543892, "grad_norm": 0.85520703, "learning_rate": 9.462e-05, "elapsed_time_per_iteration": 6.92876744, "memory(GiB)": 21.51, "elapsed_time": "2h 15m 3s", "remaining_time": "9h 21m 47s", "loss_scale": 1.0, "consumed_samples": 315648, "global_step/max_steps": "1233/6362"} +{"lm loss": 5.13807487, "grad_norm": 1.05509007, "learning_rate": 9.461e-05, "elapsed_time_per_iteration": 6.56571245, "memory(GiB)": 21.51, "elapsed_time": "2h 15m 9s", "remaining_time": "9h 21m 40s", "loss_scale": 1.0, "consumed_samples": 315904, "global_step/max_steps": "1234/6362"} +{"lm loss": 5.13791561, "grad_norm": 0.99159288, "learning_rate": 9.46e-05, "elapsed_time_per_iteration": 6.76320171, "memory(GiB)": 21.51, "elapsed_time": "2h 15m 16s", "remaining_time": "9h 21m 35s", "loss_scale": 1.0, "consumed_samples": 316160, "global_step/max_steps": "1235/6362"} +{"lm loss": 5.15622425, "grad_norm": 0.92492765, "learning_rate": 9.458e-05, "elapsed_time_per_iteration": 6.68279958, "memory(GiB)": 21.51, "elapsed_time": "2h 15m 23s", "remaining_time": "9h 21m 29s", "loss_scale": 1.0, "consumed_samples": 316416, "global_step/max_steps": "1236/6362"} +{"lm loss": 5.14339018, "grad_norm": 0.87723881, "learning_rate": 9.457e-05, "elapsed_time_per_iteration": 6.54107046, "memory(GiB)": 21.51, "elapsed_time": "2h 15m 29s", "remaining_time": "9h 21m 22s", "loss_scale": 1.0, "consumed_samples": 316672, "global_step/max_steps": "1237/6362"} +{"lm loss": 5.12843513, "grad_norm": 1.11561632, "learning_rate": 9.456e-05, "elapsed_time_per_iteration": 6.78886342, "memory(GiB)": 21.51, "elapsed_time": "2h 15m 36s", "remaining_time": "9h 21m 16s", "loss_scale": 1.0, "consumed_samples": 316928, "global_step/max_steps": "1238/6362"} +{"lm loss": 5.13210344, "grad_norm": 0.89011556, "learning_rate": 9.455e-05, "elapsed_time_per_iteration": 6.37112212, "memory(GiB)": 21.51, "elapsed_time": "2h 15m 42s", "remaining_time": "9h 21m 9s", "loss_scale": 1.0, "consumed_samples": 317184, "global_step/max_steps": "1239/6362"} +{"lm loss": 5.13224792, "grad_norm": 0.83176541, "learning_rate": 9.454e-05, "elapsed_time_per_iteration": 6.5308516, "memory(GiB)": 21.51, "elapsed_time": "2h 15m 49s", "remaining_time": "9h 21m 2s", "loss_scale": 1.0, "consumed_samples": 317440, "global_step/max_steps": "1240/6362"} +{"lm loss": 5.14012766, "grad_norm": 0.73423815, "learning_rate": 9.453e-05, "elapsed_time_per_iteration": 6.73941135, "memory(GiB)": 21.51, "elapsed_time": "2h 15m 56s", "remaining_time": "9h 20m 56s", "loss_scale": 1.0, "consumed_samples": 317696, "global_step/max_steps": "1241/6362"} +{"lm loss": 5.1151495, "grad_norm": 0.98770744, "learning_rate": 9.451e-05, "elapsed_time_per_iteration": 6.64519143, "memory(GiB)": 21.51, "elapsed_time": "2h 16m 2s", "remaining_time": "9h 20m 50s", "loss_scale": 1.0, "consumed_samples": 317952, "global_step/max_steps": "1242/6362"} +{"lm loss": 5.1321516, "grad_norm": 1.15651345, "learning_rate": 9.45e-05, "elapsed_time_per_iteration": 6.58901906, "memory(GiB)": 21.51, "elapsed_time": "2h 16m 9s", "remaining_time": "9h 20m 43s", "loss_scale": 1.0, "consumed_samples": 318208, "global_step/max_steps": "1243/6362"} +{"lm loss": 5.14961195, "grad_norm": 0.73749065, "learning_rate": 9.449e-05, "elapsed_time_per_iteration": 6.5170908, "memory(GiB)": 21.51, "elapsed_time": "2h 16m 15s", "remaining_time": "9h 20m 37s", "loss_scale": 1.0, "consumed_samples": 318464, "global_step/max_steps": "1244/6362"} +{"lm loss": 5.14747238, "grad_norm": 0.95211136, "learning_rate": 9.448e-05, "elapsed_time_per_iteration": 6.53999281, "memory(GiB)": 21.51, "elapsed_time": "2h 16m 22s", "remaining_time": "9h 20m 30s", "loss_scale": 1.0, "consumed_samples": 318720, "global_step/max_steps": "1245/6362"} +{"lm loss": 5.12442398, "grad_norm": 0.92356694, "learning_rate": 9.447e-05, "elapsed_time_per_iteration": 6.5549953, "memory(GiB)": 21.51, "elapsed_time": "2h 16m 29s", "remaining_time": "9h 20m 23s", "loss_scale": 1.0, "consumed_samples": 318976, "global_step/max_steps": "1246/6362"} +{"lm loss": 5.1276021, "grad_norm": 0.9062435, "learning_rate": 9.446e-05, "elapsed_time_per_iteration": 6.91211939, "memory(GiB)": 21.51, "elapsed_time": "2h 16m 35s", "remaining_time": "9h 20m 18s", "loss_scale": 1.0, "consumed_samples": 319232, "global_step/max_steps": "1247/6362"} +{"lm loss": 5.14447212, "grad_norm": 0.9401927, "learning_rate": 9.444e-05, "elapsed_time_per_iteration": 6.72331214, "memory(GiB)": 21.51, "elapsed_time": "2h 16m 42s", "remaining_time": "9h 20m 12s", "loss_scale": 1.0, "consumed_samples": 319488, "global_step/max_steps": "1248/6362"} +{"lm loss": 5.13345909, "grad_norm": 0.83734059, "learning_rate": 9.443e-05, "elapsed_time_per_iteration": 6.52314234, "memory(GiB)": 21.51, "elapsed_time": "2h 16m 49s", "remaining_time": "9h 20m 5s", "loss_scale": 1.0, "consumed_samples": 319744, "global_step/max_steps": "1249/6362"} +{"lm loss": 5.13951397, "grad_norm": 0.74526381, "learning_rate": 9.442e-05, "elapsed_time_per_iteration": 6.50712538, "memory(GiB)": 21.51, "elapsed_time": "2h 16m 55s", "remaining_time": "9h 19m 58s", "loss_scale": 1.0, "consumed_samples": 320000, "global_step/max_steps": "1250/6362"} +{"lm loss": 5.14070177, "grad_norm": 0.81658947, "learning_rate": 9.441e-05, "elapsed_time_per_iteration": 6.64628887, "memory(GiB)": 21.51, "elapsed_time": "2h 17m 2s", "remaining_time": "9h 19m 52s", "loss_scale": 1.0, "consumed_samples": 320256, "global_step/max_steps": "1251/6362"} +{"lm loss": 5.14151096, "grad_norm": 0.77950418, "learning_rate": 9.44e-05, "elapsed_time_per_iteration": 6.4219346, "memory(GiB)": 21.51, "elapsed_time": "2h 17m 8s", "remaining_time": "9h 19m 45s", "loss_scale": 1.0, "consumed_samples": 320512, "global_step/max_steps": "1252/6362"} +{"lm loss": 5.12094021, "grad_norm": 0.77933013, "learning_rate": 9.439e-05, "elapsed_time_per_iteration": 6.70039225, "memory(GiB)": 21.51, "elapsed_time": "2h 17m 15s", "remaining_time": "9h 19m 39s", "loss_scale": 1.0, "consumed_samples": 320768, "global_step/max_steps": "1253/6362"} +{"lm loss": 5.13177109, "grad_norm": 0.81768692, "learning_rate": 9.437e-05, "elapsed_time_per_iteration": 6.41766334, "memory(GiB)": 21.51, "elapsed_time": "2h 17m 21s", "remaining_time": "9h 19m 32s", "loss_scale": 1.0, "consumed_samples": 321024, "global_step/max_steps": "1254/6362"} +{"lm loss": 5.13916111, "grad_norm": 0.93622023, "learning_rate": 9.436e-05, "elapsed_time_per_iteration": 6.82673812, "memory(GiB)": 21.51, "elapsed_time": "2h 17m 28s", "remaining_time": "9h 19m 26s", "loss_scale": 1.0, "consumed_samples": 321280, "global_step/max_steps": "1255/6362"} +{"lm loss": 5.15270567, "grad_norm": 0.86840403, "learning_rate": 9.435e-05, "elapsed_time_per_iteration": 6.69131136, "memory(GiB)": 21.51, "elapsed_time": "2h 17m 35s", "remaining_time": "9h 19m 20s", "loss_scale": 1.0, "consumed_samples": 321536, "global_step/max_steps": "1256/6362"} +{"lm loss": 5.13439322, "grad_norm": 0.94413382, "learning_rate": 9.434e-05, "elapsed_time_per_iteration": 6.63970971, "memory(GiB)": 21.51, "elapsed_time": "2h 17m 42s", "remaining_time": "9h 19m 14s", "loss_scale": 1.0, "consumed_samples": 321792, "global_step/max_steps": "1257/6362"} +{"lm loss": 5.1195507, "grad_norm": 1.16604161, "learning_rate": 9.433e-05, "elapsed_time_per_iteration": 6.59250641, "memory(GiB)": 21.51, "elapsed_time": "2h 17m 48s", "remaining_time": "9h 19m 7s", "loss_scale": 1.0, "consumed_samples": 322048, "global_step/max_steps": "1258/6362"} +{"lm loss": 5.13742304, "grad_norm": 0.9305135, "learning_rate": 9.431e-05, "elapsed_time_per_iteration": 6.58795643, "memory(GiB)": 21.51, "elapsed_time": "2h 17m 55s", "remaining_time": "9h 19m 1s", "loss_scale": 1.0, "consumed_samples": 322304, "global_step/max_steps": "1259/6362"} +{"lm loss": 5.17304373, "grad_norm": 1.04766381, "learning_rate": 9.43e-05, "elapsed_time_per_iteration": 6.53700924, "memory(GiB)": 21.51, "elapsed_time": "2h 18m 1s", "remaining_time": "9h 18m 54s", "loss_scale": 1.0, "consumed_samples": 322560, "global_step/max_steps": "1260/6362"} +{"lm loss": 5.12129927, "grad_norm": 1.08626497, "learning_rate": 9.429e-05, "elapsed_time_per_iteration": 6.51228738, "memory(GiB)": 21.51, "elapsed_time": "2h 18m 8s", "remaining_time": "9h 18m 47s", "loss_scale": 1.0, "consumed_samples": 322816, "global_step/max_steps": "1261/6362"} +{"lm loss": 5.15505457, "grad_norm": 0.85732174, "learning_rate": 9.428e-05, "elapsed_time_per_iteration": 6.52558279, "memory(GiB)": 21.51, "elapsed_time": "2h 18m 14s", "remaining_time": "9h 18m 41s", "loss_scale": 1.0, "consumed_samples": 323072, "global_step/max_steps": "1262/6362"} +{"lm loss": 5.11947966, "grad_norm": 0.87491918, "learning_rate": 9.427e-05, "elapsed_time_per_iteration": 6.71258354, "memory(GiB)": 21.51, "elapsed_time": "2h 18m 21s", "remaining_time": "9h 18m 35s", "loss_scale": 1.0, "consumed_samples": 323328, "global_step/max_steps": "1263/6362"} +{"lm loss": 5.1140604, "grad_norm": 1.02903438, "learning_rate": 9.425e-05, "elapsed_time_per_iteration": 6.66616058, "memory(GiB)": 21.51, "elapsed_time": "2h 18m 28s", "remaining_time": "9h 18m 28s", "loss_scale": 1.0, "consumed_samples": 323584, "global_step/max_steps": "1264/6362"} +{"lm loss": 5.11960077, "grad_norm": 1.00601971, "learning_rate": 9.424e-05, "elapsed_time_per_iteration": 6.62317991, "memory(GiB)": 21.51, "elapsed_time": "2h 18m 34s", "remaining_time": "9h 18m 22s", "loss_scale": 1.0, "consumed_samples": 323840, "global_step/max_steps": "1265/6362"} +{"lm loss": 5.11685514, "grad_norm": 1.03180456, "learning_rate": 9.423e-05, "elapsed_time_per_iteration": 6.54806876, "memory(GiB)": 21.51, "elapsed_time": "2h 18m 41s", "remaining_time": "9h 18m 15s", "loss_scale": 1.0, "consumed_samples": 324096, "global_step/max_steps": "1266/6362"} +{"lm loss": 5.16010714, "grad_norm": 0.91409552, "learning_rate": 9.422e-05, "elapsed_time_per_iteration": 6.5491271, "memory(GiB)": 21.51, "elapsed_time": "2h 18m 47s", "remaining_time": "9h 18m 9s", "loss_scale": 1.0, "consumed_samples": 324352, "global_step/max_steps": "1267/6362"} +{"lm loss": 5.13891506, "grad_norm": 0.89513648, "learning_rate": 9.421e-05, "elapsed_time_per_iteration": 6.46692419, "memory(GiB)": 21.51, "elapsed_time": "2h 18m 54s", "remaining_time": "9h 18m 2s", "loss_scale": 1.0, "consumed_samples": 324608, "global_step/max_steps": "1268/6362"} +{"lm loss": 5.122262, "grad_norm": 0.82564116, "learning_rate": 9.42e-05, "elapsed_time_per_iteration": 6.50382853, "memory(GiB)": 21.51, "elapsed_time": "2h 19m 0s", "remaining_time": "9h 17m 55s", "loss_scale": 1.0, "consumed_samples": 324864, "global_step/max_steps": "1269/6362"} +{"lm loss": 5.12754393, "grad_norm": 0.74723423, "learning_rate": 9.418e-05, "elapsed_time_per_iteration": 6.51650643, "memory(GiB)": 21.51, "elapsed_time": "2h 19m 7s", "remaining_time": "9h 17m 48s", "loss_scale": 1.0, "consumed_samples": 325120, "global_step/max_steps": "1270/6362"} +{"lm loss": 5.14077711, "grad_norm": 0.73482287, "learning_rate": 9.417e-05, "elapsed_time_per_iteration": 6.51074982, "memory(GiB)": 21.51, "elapsed_time": "2h 19m 13s", "remaining_time": "9h 17m 41s", "loss_scale": 1.0, "consumed_samples": 325376, "global_step/max_steps": "1271/6362"} +{"lm loss": 5.13080263, "grad_norm": 0.77962106, "learning_rate": 9.416e-05, "elapsed_time_per_iteration": 6.72091007, "memory(GiB)": 21.51, "elapsed_time": "2h 19m 20s", "remaining_time": "9h 17m 35s", "loss_scale": 1.0, "consumed_samples": 325632, "global_step/max_steps": "1272/6362"} +{"lm loss": 5.11516237, "grad_norm": 0.76546866, "learning_rate": 9.415e-05, "elapsed_time_per_iteration": 6.51835227, "memory(GiB)": 21.51, "elapsed_time": "2h 19m 27s", "remaining_time": "9h 17m 28s", "loss_scale": 1.0, "consumed_samples": 325888, "global_step/max_steps": "1273/6362"} +{"lm loss": 5.12631464, "grad_norm": 0.82212508, "learning_rate": 9.414e-05, "elapsed_time_per_iteration": 6.69808006, "memory(GiB)": 21.51, "elapsed_time": "2h 19m 33s", "remaining_time": "9h 17m 22s", "loss_scale": 1.0, "consumed_samples": 326144, "global_step/max_steps": "1274/6362"} +{"lm loss": 5.12153387, "grad_norm": 0.84525687, "learning_rate": 9.412e-05, "elapsed_time_per_iteration": 6.59308338, "memory(GiB)": 21.51, "elapsed_time": "2h 19m 40s", "remaining_time": "9h 17m 16s", "loss_scale": 1.0, "consumed_samples": 326400, "global_step/max_steps": "1275/6362"} +{"lm loss": 5.11127806, "grad_norm": 0.96697694, "learning_rate": 9.411e-05, "elapsed_time_per_iteration": 6.59370708, "memory(GiB)": 21.51, "elapsed_time": "2h 19m 47s", "remaining_time": "9h 17m 9s", "loss_scale": 1.0, "consumed_samples": 326656, "global_step/max_steps": "1276/6362"} +{"lm loss": 5.13041306, "grad_norm": 1.043468, "learning_rate": 9.41e-05, "elapsed_time_per_iteration": 6.63140678, "memory(GiB)": 21.51, "elapsed_time": "2h 19m 53s", "remaining_time": "9h 17m 3s", "loss_scale": 1.0, "consumed_samples": 326912, "global_step/max_steps": "1277/6362"} +{"lm loss": 5.11738539, "grad_norm": 0.99665916, "learning_rate": 9.409e-05, "elapsed_time_per_iteration": 6.56951189, "memory(GiB)": 21.51, "elapsed_time": "2h 20m 0s", "remaining_time": "9h 16m 56s", "loss_scale": 1.0, "consumed_samples": 327168, "global_step/max_steps": "1278/6362"} +{"lm loss": 5.14318323, "grad_norm": 0.9337768, "learning_rate": 9.408e-05, "elapsed_time_per_iteration": 6.54555655, "memory(GiB)": 21.51, "elapsed_time": "2h 20m 6s", "remaining_time": "9h 16m 50s", "loss_scale": 1.0, "consumed_samples": 327424, "global_step/max_steps": "1279/6362"} +{"lm loss": 5.13161612, "grad_norm": 0.87353456, "learning_rate": 9.406e-05, "elapsed_time_per_iteration": 6.42428827, "memory(GiB)": 21.51, "elapsed_time": "2h 20m 13s", "remaining_time": "9h 16m 43s", "loss_scale": 1.0, "consumed_samples": 327680, "global_step/max_steps": "1280/6362"} +{"lm loss": 5.12834978, "grad_norm": 0.86323822, "learning_rate": 9.405e-05, "elapsed_time_per_iteration": 6.56351686, "memory(GiB)": 21.51, "elapsed_time": "2h 20m 19s", "remaining_time": "9h 16m 36s", "loss_scale": 1.0, "consumed_samples": 327936, "global_step/max_steps": "1281/6362"} +{"lm loss": 5.10949516, "grad_norm": 0.9833591, "learning_rate": 9.404e-05, "elapsed_time_per_iteration": 6.70038033, "memory(GiB)": 21.51, "elapsed_time": "2h 20m 26s", "remaining_time": "9h 16m 30s", "loss_scale": 1.0, "consumed_samples": 328192, "global_step/max_steps": "1282/6362"} +{"lm loss": 5.1411891, "grad_norm": 1.32132709, "learning_rate": 9.403e-05, "elapsed_time_per_iteration": 6.50884461, "memory(GiB)": 21.51, "elapsed_time": "2h 20m 32s", "remaining_time": "9h 16m 23s", "loss_scale": 1.0, "consumed_samples": 328448, "global_step/max_steps": "1283/6362"} +{"lm loss": 5.12746716, "grad_norm": 0.65914303, "learning_rate": 9.401e-05, "elapsed_time_per_iteration": 6.56978559, "memory(GiB)": 21.51, "elapsed_time": "2h 20m 39s", "remaining_time": "9h 16m 16s", "loss_scale": 1.0, "consumed_samples": 328704, "global_step/max_steps": "1284/6362"} +{"lm loss": 5.11724854, "grad_norm": 0.83876723, "learning_rate": 9.4e-05, "elapsed_time_per_iteration": 6.87334752, "memory(GiB)": 21.51, "elapsed_time": "2h 20m 46s", "remaining_time": "9h 16m 11s", "loss_scale": 1.0, "consumed_samples": 328960, "global_step/max_steps": "1285/6362"} +{"lm loss": 5.1305232, "grad_norm": 1.04718745, "learning_rate": 9.399e-05, "elapsed_time_per_iteration": 6.54061151, "memory(GiB)": 21.51, "elapsed_time": "2h 20m 52s", "remaining_time": "9h 16m 4s", "loss_scale": 1.0, "consumed_samples": 329216, "global_step/max_steps": "1286/6362"} +{"lm loss": 5.10423756, "grad_norm": 0.93411344, "learning_rate": 9.398e-05, "elapsed_time_per_iteration": 6.62772346, "memory(GiB)": 21.51, "elapsed_time": "2h 20m 59s", "remaining_time": "9h 15m 58s", "loss_scale": 1.0, "consumed_samples": 329472, "global_step/max_steps": "1287/6362"} +{"lm loss": 5.13240814, "grad_norm": 0.92808902, "learning_rate": 9.397e-05, "elapsed_time_per_iteration": 6.44494987, "memory(GiB)": 21.51, "elapsed_time": "2h 21m 6s", "remaining_time": "9h 15m 51s", "loss_scale": 1.0, "consumed_samples": 329728, "global_step/max_steps": "1288/6362"} +{"lm loss": 5.12803841, "grad_norm": 0.87180787, "learning_rate": 9.395e-05, "elapsed_time_per_iteration": 6.34625649, "memory(GiB)": 21.51, "elapsed_time": "2h 21m 12s", "remaining_time": "9h 15m 43s", "loss_scale": 1.0, "consumed_samples": 329984, "global_step/max_steps": "1289/6362"} +{"lm loss": 5.11896086, "grad_norm": 0.92070782, "learning_rate": 9.394e-05, "elapsed_time_per_iteration": 6.43319631, "memory(GiB)": 21.51, "elapsed_time": "2h 21m 18s", "remaining_time": "9h 15m 36s", "loss_scale": 1.0, "consumed_samples": 330240, "global_step/max_steps": "1290/6362"} +{"lm loss": 5.13346148, "grad_norm": 1.1183126, "learning_rate": 9.393e-05, "elapsed_time_per_iteration": 6.46860051, "memory(GiB)": 21.51, "elapsed_time": "2h 21m 25s", "remaining_time": "9h 15m 29s", "loss_scale": 1.0, "consumed_samples": 330496, "global_step/max_steps": "1291/6362"} +{"lm loss": 5.14530373, "grad_norm": 1.10656095, "learning_rate": 9.392e-05, "elapsed_time_per_iteration": 6.47585297, "memory(GiB)": 21.51, "elapsed_time": "2h 21m 31s", "remaining_time": "9h 15m 22s", "loss_scale": 1.0, "consumed_samples": 330752, "global_step/max_steps": "1292/6362"} +{"lm loss": 5.14479876, "grad_norm": 0.99106514, "learning_rate": 9.39e-05, "elapsed_time_per_iteration": 6.42195511, "memory(GiB)": 21.51, "elapsed_time": "2h 21m 38s", "remaining_time": "9h 15m 15s", "loss_scale": 1.0, "consumed_samples": 331008, "global_step/max_steps": "1293/6362"} +{"lm loss": 5.1467371, "grad_norm": 0.98042583, "learning_rate": 9.389e-05, "elapsed_time_per_iteration": 6.47021222, "memory(GiB)": 21.51, "elapsed_time": "2h 21m 44s", "remaining_time": "9h 15m 8s", "loss_scale": 1.0, "consumed_samples": 331264, "global_step/max_steps": "1294/6362"} +{"lm loss": 5.11180067, "grad_norm": 0.95520324, "learning_rate": 9.388e-05, "elapsed_time_per_iteration": 6.62443948, "memory(GiB)": 21.51, "elapsed_time": "2h 21m 51s", "remaining_time": "9h 15m 2s", "loss_scale": 1.0, "consumed_samples": 331520, "global_step/max_steps": "1295/6362"} +{"lm loss": 5.14174032, "grad_norm": 0.81231844, "learning_rate": 9.387e-05, "elapsed_time_per_iteration": 6.36636543, "memory(GiB)": 21.51, "elapsed_time": "2h 21m 57s", "remaining_time": "9h 14m 55s", "loss_scale": 1.0, "consumed_samples": 331776, "global_step/max_steps": "1296/6362"} +{"lm loss": 5.12622738, "grad_norm": 0.74855101, "learning_rate": 9.386e-05, "elapsed_time_per_iteration": 6.5414412, "memory(GiB)": 21.51, "elapsed_time": "2h 22m 4s", "remaining_time": "9h 14m 48s", "loss_scale": 1.0, "consumed_samples": 332032, "global_step/max_steps": "1297/6362"} +{"lm loss": 5.13450956, "grad_norm": 0.76958555, "learning_rate": 9.384e-05, "elapsed_time_per_iteration": 6.4438386, "memory(GiB)": 21.51, "elapsed_time": "2h 22m 10s", "remaining_time": "9h 14m 41s", "loss_scale": 1.0, "consumed_samples": 332288, "global_step/max_steps": "1298/6362"} +{"lm loss": 5.12175179, "grad_norm": 0.83694005, "learning_rate": 9.383e-05, "elapsed_time_per_iteration": 6.45557952, "memory(GiB)": 21.51, "elapsed_time": "2h 22m 17s", "remaining_time": "9h 14m 34s", "loss_scale": 1.0, "consumed_samples": 332544, "global_step/max_steps": "1299/6362"} +{"lm loss": 5.13402414, "grad_norm": 0.80356699, "learning_rate": 9.382e-05, "elapsed_time_per_iteration": 6.59050465, "memory(GiB)": 21.51, "elapsed_time": "2h 22m 23s", "remaining_time": "9h 14m 27s", "loss_scale": 1.0, "consumed_samples": 332800, "global_step/max_steps": "1300/6362"} +{"lm loss": 5.11797094, "grad_norm": 0.78027678, "learning_rate": 9.381e-05, "elapsed_time_per_iteration": 6.65439677, "memory(GiB)": 21.51, "elapsed_time": "2h 22m 30s", "remaining_time": "9h 14m 21s", "loss_scale": 1.0, "consumed_samples": 333056, "global_step/max_steps": "1301/6362"} +{"lm loss": 5.13014221, "grad_norm": 0.7919544, "learning_rate": 9.379e-05, "elapsed_time_per_iteration": 6.7755568, "memory(GiB)": 21.51, "elapsed_time": "2h 22m 37s", "remaining_time": "9h 14m 15s", "loss_scale": 1.0, "consumed_samples": 333312, "global_step/max_steps": "1302/6362"} +{"lm loss": 5.12781334, "grad_norm": 0.87222081, "learning_rate": 9.378e-05, "elapsed_time_per_iteration": 6.52647161, "memory(GiB)": 21.51, "elapsed_time": "2h 22m 43s", "remaining_time": "9h 14m 8s", "loss_scale": 1.0, "consumed_samples": 333568, "global_step/max_steps": "1303/6362"} +{"lm loss": 5.12462616, "grad_norm": 0.96808296, "learning_rate": 9.377e-05, "elapsed_time_per_iteration": 6.84428, "memory(GiB)": 21.51, "elapsed_time": "2h 22m 50s", "remaining_time": "9h 14m 3s", "loss_scale": 1.0, "consumed_samples": 333824, "global_step/max_steps": "1304/6362"} +{"lm loss": 5.14498949, "grad_norm": 0.89371979, "learning_rate": 9.376e-05, "elapsed_time_per_iteration": 6.56244397, "memory(GiB)": 21.51, "elapsed_time": "2h 22m 57s", "remaining_time": "9h 13m 56s", "loss_scale": 1.0, "consumed_samples": 334080, "global_step/max_steps": "1305/6362"} +{"lm loss": 5.11717987, "grad_norm": 0.89924258, "learning_rate": 9.374e-05, "elapsed_time_per_iteration": 6.43409896, "memory(GiB)": 21.51, "elapsed_time": "2h 23m 3s", "remaining_time": "9h 13m 49s", "loss_scale": 1.0, "consumed_samples": 334336, "global_step/max_steps": "1306/6362"} +{"lm loss": 5.11478662, "grad_norm": 1.00168586, "learning_rate": 9.373e-05, "elapsed_time_per_iteration": 6.36805224, "memory(GiB)": 21.51, "elapsed_time": "2h 23m 9s", "remaining_time": "9h 13m 42s", "loss_scale": 1.0, "consumed_samples": 334592, "global_step/max_steps": "1307/6362"} +{"lm loss": 5.11011362, "grad_norm": 1.08848155, "learning_rate": 9.372e-05, "elapsed_time_per_iteration": 6.54293561, "memory(GiB)": 21.51, "elapsed_time": "2h 23m 16s", "remaining_time": "9h 13m 35s", "loss_scale": 1.0, "consumed_samples": 334848, "global_step/max_steps": "1308/6362"} +{"lm loss": 5.13391113, "grad_norm": 0.82049942, "learning_rate": 9.371e-05, "elapsed_time_per_iteration": 6.43293715, "memory(GiB)": 21.51, "elapsed_time": "2h 23m 22s", "remaining_time": "9h 13m 28s", "loss_scale": 1.0, "consumed_samples": 335104, "global_step/max_steps": "1309/6362"} +{"lm loss": 5.12682724, "grad_norm": 0.76273984, "learning_rate": 9.37e-05, "elapsed_time_per_iteration": 6.44551253, "memory(GiB)": 21.51, "elapsed_time": "2h 23m 29s", "remaining_time": "9h 13m 21s", "loss_scale": 1.0, "consumed_samples": 335360, "global_step/max_steps": "1310/6362"} +{"lm loss": 5.11589479, "grad_norm": 0.88240814, "learning_rate": 9.368e-05, "elapsed_time_per_iteration": 6.54773545, "memory(GiB)": 21.51, "elapsed_time": "2h 23m 35s", "remaining_time": "9h 13m 14s", "loss_scale": 1.0, "consumed_samples": 335616, "global_step/max_steps": "1311/6362"} +{"lm loss": 5.122159, "grad_norm": 0.9060055, "learning_rate": 9.367e-05, "elapsed_time_per_iteration": 6.43414736, "memory(GiB)": 21.51, "elapsed_time": "2h 23m 42s", "remaining_time": "9h 13m 7s", "loss_scale": 1.0, "consumed_samples": 335872, "global_step/max_steps": "1312/6362"} +{"lm loss": 5.10567617, "grad_norm": 0.87344724, "learning_rate": 9.366e-05, "elapsed_time_per_iteration": 6.48083496, "memory(GiB)": 21.51, "elapsed_time": "2h 23m 48s", "remaining_time": "9h 13m 0s", "loss_scale": 1.0, "consumed_samples": 336128, "global_step/max_steps": "1313/6362"} +{"lm loss": 5.10869932, "grad_norm": 0.8751983, "learning_rate": 9.365e-05, "elapsed_time_per_iteration": 6.53227925, "memory(GiB)": 21.51, "elapsed_time": "2h 23m 55s", "remaining_time": "9h 12m 54s", "loss_scale": 1.0, "consumed_samples": 336384, "global_step/max_steps": "1314/6362"} +{"lm loss": 5.11142969, "grad_norm": 0.85590059, "learning_rate": 9.363e-05, "elapsed_time_per_iteration": 6.4304831, "memory(GiB)": 21.51, "elapsed_time": "2h 24m 1s", "remaining_time": "9h 12m 46s", "loss_scale": 1.0, "consumed_samples": 336640, "global_step/max_steps": "1315/6362"} +{"lm loss": 5.13951063, "grad_norm": 0.9614495, "learning_rate": 9.362e-05, "elapsed_time_per_iteration": 6.52818394, "memory(GiB)": 21.51, "elapsed_time": "2h 24m 8s", "remaining_time": "9h 12m 40s", "loss_scale": 1.0, "consumed_samples": 336896, "global_step/max_steps": "1316/6362"} +{"lm loss": 5.1343379, "grad_norm": 1.17772686, "learning_rate": 9.361e-05, "elapsed_time_per_iteration": 6.37807918, "memory(GiB)": 21.51, "elapsed_time": "2h 24m 14s", "remaining_time": "9h 12m 32s", "loss_scale": 1.0, "consumed_samples": 337152, "global_step/max_steps": "1317/6362"} +{"lm loss": 5.11214733, "grad_norm": 1.03188682, "learning_rate": 9.36e-05, "elapsed_time_per_iteration": 6.48997188, "memory(GiB)": 21.51, "elapsed_time": "2h 24m 21s", "remaining_time": "9h 12m 26s", "loss_scale": 1.0, "consumed_samples": 337408, "global_step/max_steps": "1318/6362"} +{"lm loss": 5.1392808, "grad_norm": 0.96088517, "learning_rate": 9.358e-05, "elapsed_time_per_iteration": 6.43404436, "memory(GiB)": 21.51, "elapsed_time": "2h 24m 27s", "remaining_time": "9h 12m 18s", "loss_scale": 1.0, "consumed_samples": 337664, "global_step/max_steps": "1319/6362"} +{"lm loss": 5.10207796, "grad_norm": 0.93995923, "learning_rate": 9.357e-05, "elapsed_time_per_iteration": 6.52538395, "memory(GiB)": 21.51, "elapsed_time": "2h 24m 34s", "remaining_time": "9h 12m 12s", "loss_scale": 1.0, "consumed_samples": 337920, "global_step/max_steps": "1320/6362"} +{"lm loss": 5.11226606, "grad_norm": 0.84019607, "learning_rate": 9.356e-05, "elapsed_time_per_iteration": 6.59943342, "memory(GiB)": 21.51, "elapsed_time": "2h 24m 40s", "remaining_time": "9h 12m 5s", "loss_scale": 1.0, "consumed_samples": 338176, "global_step/max_steps": "1321/6362"} +{"lm loss": 5.12339687, "grad_norm": 0.81638426, "learning_rate": 9.355e-05, "elapsed_time_per_iteration": 6.44434404, "memory(GiB)": 21.51, "elapsed_time": "2h 24m 47s", "remaining_time": "9h 11m 58s", "loss_scale": 1.0, "consumed_samples": 338432, "global_step/max_steps": "1322/6362"} +{"lm loss": 5.12826109, "grad_norm": 0.82369, "learning_rate": 9.353e-05, "elapsed_time_per_iteration": 6.49588323, "memory(GiB)": 21.51, "elapsed_time": "2h 24m 53s", "remaining_time": "9h 11m 51s", "loss_scale": 1.0, "consumed_samples": 338688, "global_step/max_steps": "1323/6362"} +{"lm loss": 5.10604239, "grad_norm": 0.78487545, "learning_rate": 9.352e-05, "elapsed_time_per_iteration": 6.75207782, "memory(GiB)": 21.51, "elapsed_time": "2h 25m 0s", "remaining_time": "9h 11m 45s", "loss_scale": 1.0, "consumed_samples": 338944, "global_step/max_steps": "1324/6362"} +{"lm loss": 5.12786913, "grad_norm": 0.83170664, "learning_rate": 9.351e-05, "elapsed_time_per_iteration": 6.75508165, "memory(GiB)": 21.51, "elapsed_time": "2h 25m 7s", "remaining_time": "9h 11m 40s", "loss_scale": 1.0, "consumed_samples": 339200, "global_step/max_steps": "1325/6362"} +{"lm loss": 5.1486721, "grad_norm": 0.85506558, "learning_rate": 9.349e-05, "elapsed_time_per_iteration": 6.50766635, "memory(GiB)": 21.51, "elapsed_time": "2h 25m 13s", "remaining_time": "9h 11m 33s", "loss_scale": 1.0, "consumed_samples": 339456, "global_step/max_steps": "1326/6362"} +{"lm loss": 5.1206851, "grad_norm": 0.76949048, "learning_rate": 9.348e-05, "elapsed_time_per_iteration": 6.75395513, "memory(GiB)": 21.51, "elapsed_time": "2h 25m 20s", "remaining_time": "9h 11m 27s", "loss_scale": 1.0, "consumed_samples": 339712, "global_step/max_steps": "1327/6362"} +{"lm loss": 5.10646009, "grad_norm": 0.78281391, "learning_rate": 9.347e-05, "elapsed_time_per_iteration": 6.80098271, "memory(GiB)": 21.51, "elapsed_time": "2h 25m 27s", "remaining_time": "9h 11m 21s", "loss_scale": 1.0, "consumed_samples": 339968, "global_step/max_steps": "1328/6362"} +{"lm loss": 5.1186614, "grad_norm": 0.74830693, "learning_rate": 9.346e-05, "elapsed_time_per_iteration": 6.85826874, "memory(GiB)": 21.51, "elapsed_time": "2h 25m 34s", "remaining_time": "9h 11m 16s", "loss_scale": 1.0, "consumed_samples": 340224, "global_step/max_steps": "1329/6362"} +{"lm loss": 5.11061001, "grad_norm": 0.78573191, "learning_rate": 9.344e-05, "elapsed_time_per_iteration": 6.6067276, "memory(GiB)": 21.51, "elapsed_time": "2h 25m 40s", "remaining_time": "9h 11m 9s", "loss_scale": 1.0, "consumed_samples": 340480, "global_step/max_steps": "1330/6362"} +{"lm loss": 5.12034369, "grad_norm": 0.82291633, "learning_rate": 9.343e-05, "elapsed_time_per_iteration": 6.5313201, "memory(GiB)": 21.51, "elapsed_time": "2h 25m 47s", "remaining_time": "9h 11m 3s", "loss_scale": 1.0, "consumed_samples": 340736, "global_step/max_steps": "1331/6362"} +{"lm loss": 5.11598921, "grad_norm": 0.81526709, "learning_rate": 9.342e-05, "elapsed_time_per_iteration": 6.51488614, "memory(GiB)": 21.51, "elapsed_time": "2h 25m 53s", "remaining_time": "9h 10m 56s", "loss_scale": 1.0, "consumed_samples": 340992, "global_step/max_steps": "1332/6362"} +{"lm loss": 5.12248182, "grad_norm": 0.88754505, "learning_rate": 9.341e-05, "elapsed_time_per_iteration": 6.7180934, "memory(GiB)": 21.51, "elapsed_time": "2h 26m 0s", "remaining_time": "9h 10m 50s", "loss_scale": 1.0, "consumed_samples": 341248, "global_step/max_steps": "1333/6362"} +{"lm loss": 5.11850929, "grad_norm": 0.91548485, "learning_rate": 9.339e-05, "elapsed_time_per_iteration": 6.64774299, "memory(GiB)": 21.51, "elapsed_time": "2h 26m 7s", "remaining_time": "9h 10m 43s", "loss_scale": 1.0, "consumed_samples": 341504, "global_step/max_steps": "1334/6362"} +{"lm loss": 5.11606979, "grad_norm": 0.90580887, "learning_rate": 9.338e-05, "elapsed_time_per_iteration": 6.48994398, "memory(GiB)": 21.51, "elapsed_time": "2h 26m 13s", "remaining_time": "9h 10m 37s", "loss_scale": 1.0, "consumed_samples": 341760, "global_step/max_steps": "1335/6362"} +{"lm loss": 5.12035418, "grad_norm": 0.88105845, "learning_rate": 9.337e-05, "elapsed_time_per_iteration": 6.57120204, "memory(GiB)": 21.51, "elapsed_time": "2h 26m 20s", "remaining_time": "9h 10m 30s", "loss_scale": 1.0, "consumed_samples": 342016, "global_step/max_steps": "1336/6362"} +{"lm loss": 5.12415171, "grad_norm": 0.86391848, "learning_rate": 9.336e-05, "elapsed_time_per_iteration": 6.53302884, "memory(GiB)": 21.51, "elapsed_time": "2h 26m 26s", "remaining_time": "9h 10m 23s", "loss_scale": 1.0, "consumed_samples": 342272, "global_step/max_steps": "1337/6362"} +{"lm loss": 5.11843348, "grad_norm": 0.87344658, "learning_rate": 9.334e-05, "elapsed_time_per_iteration": 6.48997498, "memory(GiB)": 21.51, "elapsed_time": "2h 26m 33s", "remaining_time": "9h 10m 16s", "loss_scale": 1.0, "consumed_samples": 342528, "global_step/max_steps": "1338/6362"} +{"lm loss": 5.12956095, "grad_norm": 1.06562471, "learning_rate": 9.333e-05, "elapsed_time_per_iteration": 6.51265359, "memory(GiB)": 21.51, "elapsed_time": "2h 26m 39s", "remaining_time": "9h 10m 10s", "loss_scale": 1.0, "consumed_samples": 342784, "global_step/max_steps": "1339/6362"} +{"lm loss": 5.11883926, "grad_norm": 1.08587039, "learning_rate": 9.332e-05, "elapsed_time_per_iteration": 6.32144976, "memory(GiB)": 21.51, "elapsed_time": "2h 26m 45s", "remaining_time": "9h 10m 2s", "loss_scale": 1.0, "consumed_samples": 343040, "global_step/max_steps": "1340/6362"} +{"lm loss": 5.12475491, "grad_norm": 0.87206995, "learning_rate": 9.33e-05, "elapsed_time_per_iteration": 6.32822895, "memory(GiB)": 21.51, "elapsed_time": "2h 26m 52s", "remaining_time": "9h 9m 55s", "loss_scale": 1.0, "consumed_samples": 343296, "global_step/max_steps": "1341/6362"} +{"lm loss": 5.08364058, "grad_norm": 0.83039927, "learning_rate": 9.329e-05, "elapsed_time_per_iteration": 6.56826878, "memory(GiB)": 21.51, "elapsed_time": "2h 26m 58s", "remaining_time": "9h 9m 48s", "loss_scale": 1.0, "consumed_samples": 343552, "global_step/max_steps": "1342/6362"} +{"lm loss": 5.10461903, "grad_norm": 0.88375288, "learning_rate": 9.328e-05, "elapsed_time_per_iteration": 6.42069054, "memory(GiB)": 21.51, "elapsed_time": "2h 27m 5s", "remaining_time": "9h 9m 41s", "loss_scale": 1.0, "consumed_samples": 343808, "global_step/max_steps": "1343/6362"} +{"lm loss": 5.12008238, "grad_norm": 0.80849642, "learning_rate": 9.327e-05, "elapsed_time_per_iteration": 6.35815191, "memory(GiB)": 21.51, "elapsed_time": "2h 27m 11s", "remaining_time": "9h 9m 33s", "loss_scale": 1.0, "consumed_samples": 344064, "global_step/max_steps": "1344/6362"} +{"lm loss": 5.1163311, "grad_norm": 0.77371991, "learning_rate": 9.325e-05, "elapsed_time_per_iteration": 6.56549478, "memory(GiB)": 21.51, "elapsed_time": "2h 27m 18s", "remaining_time": "9h 9m 27s", "loss_scale": 1.0, "consumed_samples": 344320, "global_step/max_steps": "1345/6362"} +{"lm loss": 5.09398508, "grad_norm": 0.89450276, "learning_rate": 9.324e-05, "elapsed_time_per_iteration": 6.66063929, "memory(GiB)": 21.51, "elapsed_time": "2h 27m 24s", "remaining_time": "9h 9m 21s", "loss_scale": 1.0, "consumed_samples": 344576, "global_step/max_steps": "1346/6362"} +{"lm loss": 5.12183237, "grad_norm": 0.80158055, "learning_rate": 9.323e-05, "elapsed_time_per_iteration": 6.45015407, "memory(GiB)": 21.51, "elapsed_time": "2h 27m 31s", "remaining_time": "9h 9m 14s", "loss_scale": 1.0, "consumed_samples": 344832, "global_step/max_steps": "1347/6362"} +{"lm loss": 5.12336302, "grad_norm": 0.73303813, "learning_rate": 9.321e-05, "elapsed_time_per_iteration": 6.63182831, "memory(GiB)": 21.51, "elapsed_time": "2h 27m 37s", "remaining_time": "9h 9m 7s", "loss_scale": 1.0, "consumed_samples": 345088, "global_step/max_steps": "1348/6362"} +{"lm loss": 5.1187582, "grad_norm": 0.72886771, "learning_rate": 9.32e-05, "elapsed_time_per_iteration": 6.74016237, "memory(GiB)": 21.51, "elapsed_time": "2h 27m 44s", "remaining_time": "9h 9m 1s", "loss_scale": 1.0, "consumed_samples": 345344, "global_step/max_steps": "1349/6362"} +{"lm loss": 5.11862135, "grad_norm": 0.8020705, "learning_rate": 9.319e-05, "elapsed_time_per_iteration": 6.51900458, "memory(GiB)": 21.51, "elapsed_time": "2h 27m 51s", "remaining_time": "9h 8m 55s", "loss_scale": 1.0, "consumed_samples": 345600, "global_step/max_steps": "1350/6362"} +{"lm loss": 5.11123753, "grad_norm": 0.87003517, "learning_rate": 9.318e-05, "elapsed_time_per_iteration": 6.71108437, "memory(GiB)": 21.51, "elapsed_time": "2h 27m 57s", "remaining_time": "9h 8m 49s", "loss_scale": 1.0, "consumed_samples": 345856, "global_step/max_steps": "1351/6362"} +{"lm loss": 5.1380291, "grad_norm": 0.93605059, "learning_rate": 9.316e-05, "elapsed_time_per_iteration": 6.73479629, "memory(GiB)": 21.51, "elapsed_time": "2h 28m 4s", "remaining_time": "9h 8m 43s", "loss_scale": 1.0, "consumed_samples": 346112, "global_step/max_steps": "1352/6362"} +{"lm loss": 5.11077785, "grad_norm": 0.92341524, "learning_rate": 9.315e-05, "elapsed_time_per_iteration": 6.58225632, "memory(GiB)": 21.51, "elapsed_time": "2h 28m 11s", "remaining_time": "9h 8m 36s", "loss_scale": 1.0, "consumed_samples": 346368, "global_step/max_steps": "1353/6362"} +{"lm loss": 5.13951683, "grad_norm": 0.9512822, "learning_rate": 9.314e-05, "elapsed_time_per_iteration": 6.68415713, "memory(GiB)": 21.51, "elapsed_time": "2h 28m 17s", "remaining_time": "9h 8m 30s", "loss_scale": 1.0, "consumed_samples": 346624, "global_step/max_steps": "1354/6362"} +{"lm loss": 5.12054634, "grad_norm": 1.05049491, "learning_rate": 9.312e-05, "elapsed_time_per_iteration": 6.44753098, "memory(GiB)": 21.51, "elapsed_time": "2h 28m 24s", "remaining_time": "9h 8m 23s", "loss_scale": 1.0, "consumed_samples": 346880, "global_step/max_steps": "1355/6362"} +{"lm loss": 5.10798645, "grad_norm": 0.97182792, "learning_rate": 9.311e-05, "elapsed_time_per_iteration": 6.61982036, "memory(GiB)": 21.51, "elapsed_time": "2h 28m 30s", "remaining_time": "9h 8m 16s", "loss_scale": 1.0, "consumed_samples": 347136, "global_step/max_steps": "1356/6362"} +{"lm loss": 5.1155715, "grad_norm": 1.14529705, "learning_rate": 9.31e-05, "elapsed_time_per_iteration": 6.75915527, "memory(GiB)": 21.51, "elapsed_time": "2h 28m 37s", "remaining_time": "9h 8m 11s", "loss_scale": 1.0, "consumed_samples": 347392, "global_step/max_steps": "1357/6362"} +{"lm loss": 5.11271811, "grad_norm": 0.92059165, "learning_rate": 9.309e-05, "elapsed_time_per_iteration": 6.63185382, "memory(GiB)": 21.51, "elapsed_time": "2h 28m 44s", "remaining_time": "9h 8m 4s", "loss_scale": 1.0, "consumed_samples": 347648, "global_step/max_steps": "1358/6362"} +{"lm loss": 5.1295228, "grad_norm": 0.86522394, "learning_rate": 9.307e-05, "elapsed_time_per_iteration": 6.59970307, "memory(GiB)": 21.51, "elapsed_time": "2h 28m 50s", "remaining_time": "9h 7m 58s", "loss_scale": 1.0, "consumed_samples": 347904, "global_step/max_steps": "1359/6362"} +{"lm loss": 5.09116411, "grad_norm": 0.87563819, "learning_rate": 9.306e-05, "elapsed_time_per_iteration": 6.3641212, "memory(GiB)": 21.51, "elapsed_time": "2h 28m 57s", "remaining_time": "9h 7m 50s", "loss_scale": 1.0, "consumed_samples": 348160, "global_step/max_steps": "1360/6362"} +{"lm loss": 5.12277985, "grad_norm": 0.86659294, "learning_rate": 9.305e-05, "elapsed_time_per_iteration": 6.39211535, "memory(GiB)": 21.51, "elapsed_time": "2h 29m 3s", "remaining_time": "9h 7m 43s", "loss_scale": 1.0, "consumed_samples": 348416, "global_step/max_steps": "1361/6362"} +{"lm loss": 5.10768604, "grad_norm": 0.8174147, "learning_rate": 9.303e-05, "elapsed_time_per_iteration": 6.29126811, "memory(GiB)": 21.51, "elapsed_time": "2h 29m 10s", "remaining_time": "9h 7m 36s", "loss_scale": 1.0, "consumed_samples": 348672, "global_step/max_steps": "1362/6362"} +{"lm loss": 5.13190413, "grad_norm": 0.92317009, "learning_rate": 9.302e-05, "elapsed_time_per_iteration": 6.4628005, "memory(GiB)": 21.51, "elapsed_time": "2h 29m 16s", "remaining_time": "9h 7m 29s", "loss_scale": 1.0, "consumed_samples": 348928, "global_step/max_steps": "1363/6362"} +{"lm loss": 5.12362528, "grad_norm": 0.92627203, "learning_rate": 9.301e-05, "elapsed_time_per_iteration": 6.54477429, "memory(GiB)": 21.51, "elapsed_time": "2h 29m 23s", "remaining_time": "9h 7m 22s", "loss_scale": 1.0, "consumed_samples": 349184, "global_step/max_steps": "1364/6362"} +{"lm loss": 5.09763002, "grad_norm": 0.95968115, "learning_rate": 9.299e-05, "elapsed_time_per_iteration": 6.48273754, "memory(GiB)": 21.51, "elapsed_time": "2h 29m 29s", "remaining_time": "9h 7m 15s", "loss_scale": 1.0, "consumed_samples": 349440, "global_step/max_steps": "1365/6362"} +{"lm loss": 5.1332469, "grad_norm": 0.98754901, "learning_rate": 9.298e-05, "elapsed_time_per_iteration": 6.34051824, "memory(GiB)": 21.51, "elapsed_time": "2h 29m 35s", "remaining_time": "9h 7m 8s", "loss_scale": 1.0, "consumed_samples": 349696, "global_step/max_steps": "1366/6362"} +{"lm loss": 5.13485479, "grad_norm": 0.9330079, "learning_rate": 9.297e-05, "elapsed_time_per_iteration": 6.43375802, "memory(GiB)": 21.51, "elapsed_time": "2h 29m 42s", "remaining_time": "9h 7m 1s", "loss_scale": 1.0, "consumed_samples": 349952, "global_step/max_steps": "1367/6362"} +{"lm loss": 5.09829617, "grad_norm": 0.79636073, "learning_rate": 9.296e-05, "elapsed_time_per_iteration": 6.78595877, "memory(GiB)": 21.51, "elapsed_time": "2h 29m 49s", "remaining_time": "9h 6m 55s", "loss_scale": 1.0, "consumed_samples": 350208, "global_step/max_steps": "1368/6362"} +{"lm loss": 5.09535217, "grad_norm": 0.76451731, "learning_rate": 9.294e-05, "elapsed_time_per_iteration": 6.71243191, "memory(GiB)": 21.51, "elapsed_time": "2h 29m 55s", "remaining_time": "9h 6m 49s", "loss_scale": 1.0, "consumed_samples": 350464, "global_step/max_steps": "1369/6362"} +{"lm loss": 5.11278725, "grad_norm": 0.76755232, "learning_rate": 9.293e-05, "elapsed_time_per_iteration": 6.4000001, "memory(GiB)": 21.51, "elapsed_time": "2h 30m 2s", "remaining_time": "9h 6m 42s", "loss_scale": 1.0, "consumed_samples": 350720, "global_step/max_steps": "1370/6362"} +{"lm loss": 5.11246061, "grad_norm": 0.69134521, "learning_rate": 9.292e-05, "elapsed_time_per_iteration": 6.51719069, "memory(GiB)": 21.51, "elapsed_time": "2h 30m 8s", "remaining_time": "9h 6m 35s", "loss_scale": 1.0, "consumed_samples": 350976, "global_step/max_steps": "1371/6362"} +{"lm loss": 5.11008358, "grad_norm": 0.642811, "learning_rate": 9.29e-05, "elapsed_time_per_iteration": 6.66693521, "memory(GiB)": 21.51, "elapsed_time": "2h 30m 15s", "remaining_time": "9h 6m 29s", "loss_scale": 1.0, "consumed_samples": 351232, "global_step/max_steps": "1372/6362"} +{"lm loss": 5.12157631, "grad_norm": 0.710729, "learning_rate": 9.289e-05, "elapsed_time_per_iteration": 6.8269527, "memory(GiB)": 21.51, "elapsed_time": "2h 30m 22s", "remaining_time": "9h 6m 23s", "loss_scale": 1.0, "consumed_samples": 351488, "global_step/max_steps": "1373/6362"} +{"lm loss": 5.11154318, "grad_norm": 0.70210367, "learning_rate": 9.288e-05, "elapsed_time_per_iteration": 6.53671098, "memory(GiB)": 21.51, "elapsed_time": "2h 30m 28s", "remaining_time": "9h 6m 16s", "loss_scale": 1.0, "consumed_samples": 351744, "global_step/max_steps": "1374/6362"} +{"lm loss": 5.11209965, "grad_norm": 0.74898601, "learning_rate": 9.286e-05, "elapsed_time_per_iteration": 6.69256067, "memory(GiB)": 21.51, "elapsed_time": "2h 30m 35s", "remaining_time": "9h 6m 10s", "loss_scale": 1.0, "consumed_samples": 352000, "global_step/max_steps": "1375/6362"} +{"lm loss": 5.12358141, "grad_norm": 0.85721159, "learning_rate": 9.285e-05, "elapsed_time_per_iteration": 6.76954484, "memory(GiB)": 21.51, "elapsed_time": "2h 30m 42s", "remaining_time": "9h 6m 4s", "loss_scale": 1.0, "consumed_samples": 352256, "global_step/max_steps": "1376/6362"} +{"lm loss": 5.10002661, "grad_norm": 0.95169997, "learning_rate": 9.284e-05, "elapsed_time_per_iteration": 6.63844991, "memory(GiB)": 21.51, "elapsed_time": "2h 30m 48s", "remaining_time": "9h 5m 58s", "loss_scale": 1.0, "consumed_samples": 352512, "global_step/max_steps": "1377/6362"} +{"lm loss": 5.12392282, "grad_norm": 1.09628367, "learning_rate": 9.282e-05, "elapsed_time_per_iteration": 6.64840865, "memory(GiB)": 21.51, "elapsed_time": "2h 30m 55s", "remaining_time": "9h 5m 52s", "loss_scale": 1.0, "consumed_samples": 352768, "global_step/max_steps": "1378/6362"} +{"lm loss": 5.10327864, "grad_norm": 0.94329071, "learning_rate": 9.281e-05, "elapsed_time_per_iteration": 6.50654268, "memory(GiB)": 21.51, "elapsed_time": "2h 31m 1s", "remaining_time": "9h 5m 45s", "loss_scale": 1.0, "consumed_samples": 353024, "global_step/max_steps": "1379/6362"} +{"lm loss": 5.1174078, "grad_norm": 0.92211002, "learning_rate": 9.28e-05, "elapsed_time_per_iteration": 6.52684188, "memory(GiB)": 21.51, "elapsed_time": "2h 31m 8s", "remaining_time": "9h 5m 38s", "loss_scale": 1.0, "consumed_samples": 353280, "global_step/max_steps": "1380/6362"} +{"lm loss": 5.08837271, "grad_norm": 1.0473218, "learning_rate": 9.278e-05, "elapsed_time_per_iteration": 6.46386862, "memory(GiB)": 21.51, "elapsed_time": "2h 31m 14s", "remaining_time": "9h 5m 31s", "loss_scale": 1.0, "consumed_samples": 353536, "global_step/max_steps": "1381/6362"} +{"lm loss": 5.11768103, "grad_norm": 0.94561285, "learning_rate": 9.277e-05, "elapsed_time_per_iteration": 6.52596688, "memory(GiB)": 21.51, "elapsed_time": "2h 31m 21s", "remaining_time": "9h 5m 24s", "loss_scale": 1.0, "consumed_samples": 353792, "global_step/max_steps": "1382/6362"} +{"lm loss": 5.09449434, "grad_norm": 0.92553282, "learning_rate": 9.276e-05, "elapsed_time_per_iteration": 6.48672438, "memory(GiB)": 21.51, "elapsed_time": "2h 31m 27s", "remaining_time": "9h 5m 18s", "loss_scale": 1.0, "consumed_samples": 354048, "global_step/max_steps": "1383/6362"} +{"lm loss": 5.11251688, "grad_norm": 0.89057666, "learning_rate": 9.274e-05, "elapsed_time_per_iteration": 6.51953864, "memory(GiB)": 21.51, "elapsed_time": "2h 31m 34s", "remaining_time": "9h 5m 11s", "loss_scale": 1.0, "consumed_samples": 354304, "global_step/max_steps": "1384/6362"} +{"lm loss": 5.12316608, "grad_norm": 0.92025065, "learning_rate": 9.273e-05, "elapsed_time_per_iteration": 6.4647491, "memory(GiB)": 21.51, "elapsed_time": "2h 31m 40s", "remaining_time": "9h 5m 4s", "loss_scale": 1.0, "consumed_samples": 354560, "global_step/max_steps": "1385/6362"} +{"lm loss": 5.11092281, "grad_norm": 0.88810831, "learning_rate": 9.272e-05, "elapsed_time_per_iteration": 6.63962317, "memory(GiB)": 21.51, "elapsed_time": "2h 31m 47s", "remaining_time": "9h 4m 57s", "loss_scale": 1.0, "consumed_samples": 354816, "global_step/max_steps": "1386/6362"} +{"lm loss": 5.1160574, "grad_norm": 0.75890326, "learning_rate": 9.27e-05, "elapsed_time_per_iteration": 6.64600039, "memory(GiB)": 21.51, "elapsed_time": "2h 31m 54s", "remaining_time": "9h 4m 51s", "loss_scale": 1.0, "consumed_samples": 355072, "global_step/max_steps": "1387/6362"} +{"lm loss": 5.09202623, "grad_norm": 0.63395256, "learning_rate": 9.269e-05, "elapsed_time_per_iteration": 6.49139738, "memory(GiB)": 21.51, "elapsed_time": "2h 32m 0s", "remaining_time": "9h 4m 44s", "loss_scale": 1.0, "consumed_samples": 355328, "global_step/max_steps": "1388/6362"} +{"lm loss": 5.11376905, "grad_norm": 0.73714161, "learning_rate": 9.268e-05, "elapsed_time_per_iteration": 6.71363711, "memory(GiB)": 21.51, "elapsed_time": "2h 32m 7s", "remaining_time": "9h 4m 38s", "loss_scale": 1.0, "consumed_samples": 355584, "global_step/max_steps": "1389/6362"} +{"lm loss": 5.12366056, "grad_norm": 0.74740356, "learning_rate": 9.266e-05, "elapsed_time_per_iteration": 6.59933448, "memory(GiB)": 21.51, "elapsed_time": "2h 32m 14s", "remaining_time": "9h 4m 32s", "loss_scale": 1.0, "consumed_samples": 355840, "global_step/max_steps": "1390/6362"} +{"lm loss": 5.1085887, "grad_norm": 0.78408712, "learning_rate": 9.265e-05, "elapsed_time_per_iteration": 6.56535482, "memory(GiB)": 21.51, "elapsed_time": "2h 32m 20s", "remaining_time": "9h 4m 25s", "loss_scale": 1.0, "consumed_samples": 356096, "global_step/max_steps": "1391/6362"} +{"lm loss": 5.09011316, "grad_norm": 0.87326151, "learning_rate": 9.264e-05, "elapsed_time_per_iteration": 6.82599616, "memory(GiB)": 21.51, "elapsed_time": "2h 32m 27s", "remaining_time": "9h 4m 20s", "loss_scale": 1.0, "consumed_samples": 356352, "global_step/max_steps": "1392/6362"} +{"lm loss": 5.1086812, "grad_norm": 1.00953627, "learning_rate": 9.262e-05, "elapsed_time_per_iteration": 6.62208581, "memory(GiB)": 21.51, "elapsed_time": "2h 32m 34s", "remaining_time": "9h 4m 13s", "loss_scale": 1.0, "consumed_samples": 356608, "global_step/max_steps": "1393/6362"} +{"lm loss": 5.12487745, "grad_norm": 0.9594478, "learning_rate": 9.261e-05, "elapsed_time_per_iteration": 6.53567386, "memory(GiB)": 21.51, "elapsed_time": "2h 32m 40s", "remaining_time": "9h 4m 6s", "loss_scale": 1.0, "consumed_samples": 356864, "global_step/max_steps": "1394/6362"} +{"lm loss": 5.11447334, "grad_norm": 0.87180281, "learning_rate": 9.26e-05, "elapsed_time_per_iteration": 6.74087882, "memory(GiB)": 21.51, "elapsed_time": "2h 32m 47s", "remaining_time": "9h 4m 0s", "loss_scale": 1.0, "consumed_samples": 357120, "global_step/max_steps": "1395/6362"} +{"lm loss": 5.11247349, "grad_norm": 0.80551875, "learning_rate": 9.258e-05, "elapsed_time_per_iteration": 6.77819109, "memory(GiB)": 21.51, "elapsed_time": "2h 32m 54s", "remaining_time": "9h 3m 55s", "loss_scale": 1.0, "consumed_samples": 357376, "global_step/max_steps": "1396/6362"} +{"lm loss": 5.13909817, "grad_norm": 0.69579995, "learning_rate": 9.257e-05, "elapsed_time_per_iteration": 6.57355189, "memory(GiB)": 21.51, "elapsed_time": "2h 33m 0s", "remaining_time": "9h 3m 48s", "loss_scale": 1.0, "consumed_samples": 357632, "global_step/max_steps": "1397/6362"} +{"lm loss": 5.09440708, "grad_norm": 0.80523795, "learning_rate": 9.256e-05, "elapsed_time_per_iteration": 6.74750304, "memory(GiB)": 21.51, "elapsed_time": "2h 33m 7s", "remaining_time": "9h 3m 42s", "loss_scale": 1.0, "consumed_samples": 357888, "global_step/max_steps": "1398/6362"} +{"lm loss": 5.11351347, "grad_norm": 0.91801518, "learning_rate": 9.254e-05, "elapsed_time_per_iteration": 6.48940897, "memory(GiB)": 21.51, "elapsed_time": "2h 33m 13s", "remaining_time": "9h 3m 35s", "loss_scale": 1.0, "consumed_samples": 358144, "global_step/max_steps": "1399/6362"} +{"lm loss": 5.09775019, "grad_norm": 1.01766384, "learning_rate": 9.253e-05, "elapsed_time_per_iteration": 6.74070692, "memory(GiB)": 21.51, "elapsed_time": "2h 33m 20s", "remaining_time": "9h 3m 29s", "loss_scale": 1.0, "consumed_samples": 358400, "global_step/max_steps": "1400/6362"} +{"lm loss": 5.1089325, "grad_norm": 0.96958959, "learning_rate": 9.252e-05, "elapsed_time_per_iteration": 6.59398174, "memory(GiB)": 21.51, "elapsed_time": "2h 33m 27s", "remaining_time": "9h 3m 23s", "loss_scale": 1.0, "consumed_samples": 358656, "global_step/max_steps": "1401/6362"} +{"lm loss": 5.12630081, "grad_norm": 1.00742924, "learning_rate": 9.25e-05, "elapsed_time_per_iteration": 6.62044382, "memory(GiB)": 21.51, "elapsed_time": "2h 33m 33s", "remaining_time": "9h 3m 16s", "loss_scale": 1.0, "consumed_samples": 358912, "global_step/max_steps": "1402/6362"} +{"lm loss": 5.11561775, "grad_norm": 1.12322056, "learning_rate": 9.249e-05, "elapsed_time_per_iteration": 6.58814311, "memory(GiB)": 21.51, "elapsed_time": "2h 33m 40s", "remaining_time": "9h 3m 10s", "loss_scale": 1.0, "consumed_samples": 359168, "global_step/max_steps": "1403/6362"} +{"lm loss": 5.09317207, "grad_norm": 0.95163441, "learning_rate": 9.248e-05, "elapsed_time_per_iteration": 6.56748438, "memory(GiB)": 21.51, "elapsed_time": "2h 33m 47s", "remaining_time": "9h 3m 3s", "loss_scale": 1.0, "consumed_samples": 359424, "global_step/max_steps": "1404/6362"} +{"lm loss": 5.13865089, "grad_norm": 0.81248796, "learning_rate": 9.246e-05, "elapsed_time_per_iteration": 6.65627027, "memory(GiB)": 21.51, "elapsed_time": "2h 33m 53s", "remaining_time": "9h 2m 57s", "loss_scale": 1.0, "consumed_samples": 359680, "global_step/max_steps": "1405/6362"} +{"lm loss": 5.10578823, "grad_norm": 0.78057104, "learning_rate": 9.245e-05, "elapsed_time_per_iteration": 6.6706109, "memory(GiB)": 21.51, "elapsed_time": "2h 34m 0s", "remaining_time": "9h 2m 51s", "loss_scale": 1.0, "consumed_samples": 359936, "global_step/max_steps": "1406/6362"} +{"lm loss": 5.10756302, "grad_norm": 0.89086813, "learning_rate": 9.244e-05, "elapsed_time_per_iteration": 6.54604459, "memory(GiB)": 21.51, "elapsed_time": "2h 34m 6s", "remaining_time": "9h 2m 44s", "loss_scale": 1.0, "consumed_samples": 360192, "global_step/max_steps": "1407/6362"} +{"lm loss": 5.09944105, "grad_norm": 0.90256584, "learning_rate": 9.242e-05, "elapsed_time_per_iteration": 6.65979028, "memory(GiB)": 21.51, "elapsed_time": "2h 34m 13s", "remaining_time": "9h 2m 38s", "loss_scale": 1.0, "consumed_samples": 360448, "global_step/max_steps": "1408/6362"} +{"lm loss": 5.10889101, "grad_norm": 0.77602887, "learning_rate": 9.241e-05, "elapsed_time_per_iteration": 6.4573791, "memory(GiB)": 21.51, "elapsed_time": "2h 34m 20s", "remaining_time": "9h 2m 31s", "loss_scale": 1.0, "consumed_samples": 360704, "global_step/max_steps": "1409/6362"} +{"lm loss": 5.10460949, "grad_norm": 0.76193577, "learning_rate": 9.24e-05, "elapsed_time_per_iteration": 6.77318764, "memory(GiB)": 21.51, "elapsed_time": "2h 34m 26s", "remaining_time": "9h 2m 25s", "loss_scale": 1.0, "consumed_samples": 360960, "global_step/max_steps": "1410/6362"} +{"lm loss": 5.1286211, "grad_norm": 0.80718923, "learning_rate": 9.238e-05, "elapsed_time_per_iteration": 6.51204538, "memory(GiB)": 21.51, "elapsed_time": "2h 34m 33s", "remaining_time": "9h 2m 18s", "loss_scale": 1.0, "consumed_samples": 361216, "global_step/max_steps": "1411/6362"} +{"lm loss": 5.1089468, "grad_norm": 0.82584584, "learning_rate": 9.237e-05, "elapsed_time_per_iteration": 6.48794222, "memory(GiB)": 21.51, "elapsed_time": "2h 34m 39s", "remaining_time": "9h 2m 11s", "loss_scale": 1.0, "consumed_samples": 361472, "global_step/max_steps": "1412/6362"} +{"lm loss": 5.09387875, "grad_norm": 0.85703206, "learning_rate": 9.236e-05, "elapsed_time_per_iteration": 6.47747278, "memory(GiB)": 21.51, "elapsed_time": "2h 34m 46s", "remaining_time": "9h 2m 4s", "loss_scale": 1.0, "consumed_samples": 361728, "global_step/max_steps": "1413/6362"} +{"lm loss": 5.10869122, "grad_norm": 0.81264919, "learning_rate": 9.234e-05, "elapsed_time_per_iteration": 6.50061727, "memory(GiB)": 21.51, "elapsed_time": "2h 34m 52s", "remaining_time": "9h 1m 58s", "loss_scale": 1.0, "consumed_samples": 361984, "global_step/max_steps": "1414/6362"} +{"lm loss": 5.11513424, "grad_norm": 0.70537645, "learning_rate": 9.233e-05, "elapsed_time_per_iteration": 6.58112335, "memory(GiB)": 21.51, "elapsed_time": "2h 34m 59s", "remaining_time": "9h 1m 51s", "loss_scale": 1.0, "consumed_samples": 362240, "global_step/max_steps": "1415/6362"} +{"lm loss": 5.09028387, "grad_norm": 0.73411238, "learning_rate": 9.231e-05, "elapsed_time_per_iteration": 6.5532887, "memory(GiB)": 21.51, "elapsed_time": "2h 35m 5s", "remaining_time": "9h 1m 44s", "loss_scale": 1.0, "consumed_samples": 362496, "global_step/max_steps": "1416/6362"} +{"lm loss": 5.11390209, "grad_norm": 0.85988832, "learning_rate": 9.23e-05, "elapsed_time_per_iteration": 6.86087227, "memory(GiB)": 21.51, "elapsed_time": "2h 35m 12s", "remaining_time": "9h 1m 39s", "loss_scale": 1.0, "consumed_samples": 362752, "global_step/max_steps": "1417/6362"} +{"lm loss": 5.10847807, "grad_norm": 1.0038631, "learning_rate": 9.229e-05, "elapsed_time_per_iteration": 6.63440013, "memory(GiB)": 21.51, "elapsed_time": "2h 35m 19s", "remaining_time": "9h 1m 33s", "loss_scale": 1.0, "consumed_samples": 363008, "global_step/max_steps": "1418/6362"} +{"lm loss": 5.08561134, "grad_norm": 1.1561178, "learning_rate": 9.227e-05, "elapsed_time_per_iteration": 6.7015295, "memory(GiB)": 21.51, "elapsed_time": "2h 35m 26s", "remaining_time": "9h 1m 26s", "loss_scale": 1.0, "consumed_samples": 363264, "global_step/max_steps": "1419/6362"} +{"lm loss": 5.10316658, "grad_norm": 0.83656234, "learning_rate": 9.226e-05, "elapsed_time_per_iteration": 6.65939999, "memory(GiB)": 21.51, "elapsed_time": "2h 35m 32s", "remaining_time": "9h 1m 20s", "loss_scale": 1.0, "consumed_samples": 363520, "global_step/max_steps": "1420/6362"} +{"lm loss": 5.10808086, "grad_norm": 0.76325452, "learning_rate": 9.225e-05, "elapsed_time_per_iteration": 6.60847378, "memory(GiB)": 21.51, "elapsed_time": "2h 35m 39s", "remaining_time": "9h 1m 14s", "loss_scale": 1.0, "consumed_samples": 363776, "global_step/max_steps": "1421/6362"} +{"lm loss": 5.11870623, "grad_norm": 0.93882024, "learning_rate": 9.223e-05, "elapsed_time_per_iteration": 6.6637547, "memory(GiB)": 21.51, "elapsed_time": "2h 35m 46s", "remaining_time": "9h 1m 7s", "loss_scale": 1.0, "consumed_samples": 364032, "global_step/max_steps": "1422/6362"} +{"lm loss": 5.11222124, "grad_norm": 0.93988144, "learning_rate": 9.222e-05, "elapsed_time_per_iteration": 6.61888242, "memory(GiB)": 21.51, "elapsed_time": "2h 35m 52s", "remaining_time": "9h 1m 1s", "loss_scale": 1.0, "consumed_samples": 364288, "global_step/max_steps": "1423/6362"} +{"lm loss": 5.11368275, "grad_norm": 0.82899976, "learning_rate": 9.22e-05, "elapsed_time_per_iteration": 6.49345851, "memory(GiB)": 21.51, "elapsed_time": "2h 35m 59s", "remaining_time": "9h 0m 54s", "loss_scale": 1.0, "consumed_samples": 364544, "global_step/max_steps": "1424/6362"} +{"lm loss": 5.11649179, "grad_norm": 0.730506, "learning_rate": 9.219e-05, "elapsed_time_per_iteration": 6.86011195, "memory(GiB)": 21.51, "elapsed_time": "2h 36m 6s", "remaining_time": "9h 0m 49s", "loss_scale": 1.0, "consumed_samples": 364800, "global_step/max_steps": "1425/6362"} +{"lm loss": 5.0821476, "grad_norm": 0.83748388, "learning_rate": 9.218e-05, "elapsed_time_per_iteration": 6.68402576, "memory(GiB)": 21.51, "elapsed_time": "2h 36m 12s", "remaining_time": "9h 0m 42s", "loss_scale": 1.0, "consumed_samples": 365056, "global_step/max_steps": "1426/6362"} +{"lm loss": 5.09603405, "grad_norm": 0.99727219, "learning_rate": 9.216e-05, "elapsed_time_per_iteration": 6.74671602, "memory(GiB)": 21.51, "elapsed_time": "2h 36m 19s", "remaining_time": "9h 0m 36s", "loss_scale": 1.0, "consumed_samples": 365312, "global_step/max_steps": "1427/6362"} +{"lm loss": 5.08748674, "grad_norm": 0.91799551, "learning_rate": 9.215e-05, "elapsed_time_per_iteration": 6.63080883, "memory(GiB)": 21.51, "elapsed_time": "2h 36m 26s", "remaining_time": "9h 0m 30s", "loss_scale": 1.0, "consumed_samples": 365568, "global_step/max_steps": "1428/6362"} +{"lm loss": 5.12547207, "grad_norm": 0.86612928, "learning_rate": 9.214e-05, "elapsed_time_per_iteration": 6.72756696, "memory(GiB)": 21.51, "elapsed_time": "2h 36m 32s", "remaining_time": "9h 0m 24s", "loss_scale": 1.0, "consumed_samples": 365824, "global_step/max_steps": "1429/6362"} +{"lm loss": 5.10282183, "grad_norm": 0.78044271, "learning_rate": 9.212e-05, "elapsed_time_per_iteration": 6.49152446, "memory(GiB)": 21.51, "elapsed_time": "2h 36m 39s", "remaining_time": "9h 0m 17s", "loss_scale": 1.0, "consumed_samples": 366080, "global_step/max_steps": "1430/6362"} +{"lm loss": 5.10913086, "grad_norm": 0.75406009, "learning_rate": 9.211e-05, "elapsed_time_per_iteration": 6.48987079, "memory(GiB)": 21.51, "elapsed_time": "2h 36m 45s", "remaining_time": "9h 0m 10s", "loss_scale": 1.0, "consumed_samples": 366336, "global_step/max_steps": "1431/6362"} +{"lm loss": 5.08322239, "grad_norm": 0.71784711, "learning_rate": 9.209e-05, "elapsed_time_per_iteration": 6.89703226, "memory(GiB)": 21.51, "elapsed_time": "2h 36m 52s", "remaining_time": "9h 0m 5s", "loss_scale": 1.0, "consumed_samples": 366592, "global_step/max_steps": "1432/6362"} +{"lm loss": 5.09941339, "grad_norm": 0.72629756, "learning_rate": 9.208e-05, "elapsed_time_per_iteration": 6.63960266, "memory(GiB)": 21.51, "elapsed_time": "2h 36m 59s", "remaining_time": "8h 59m 59s", "loss_scale": 1.0, "consumed_samples": 366848, "global_step/max_steps": "1433/6362"} +{"lm loss": 5.08938932, "grad_norm": 0.8069489, "learning_rate": 9.207e-05, "elapsed_time_per_iteration": 6.66383481, "memory(GiB)": 21.51, "elapsed_time": "2h 37m 5s", "remaining_time": "8h 59m 52s", "loss_scale": 1.0, "consumed_samples": 367104, "global_step/max_steps": "1434/6362"} +{"lm loss": 5.11172199, "grad_norm": 0.86345196, "learning_rate": 9.205e-05, "elapsed_time_per_iteration": 6.66243291, "memory(GiB)": 21.51, "elapsed_time": "2h 37m 12s", "remaining_time": "8h 59m 46s", "loss_scale": 1.0, "consumed_samples": 367360, "global_step/max_steps": "1435/6362"} +{"lm loss": 5.08427286, "grad_norm": 0.94377577, "learning_rate": 9.204e-05, "elapsed_time_per_iteration": 6.5132401, "memory(GiB)": 21.51, "elapsed_time": "2h 37m 19s", "remaining_time": "8h 59m 39s", "loss_scale": 1.0, "consumed_samples": 367616, "global_step/max_steps": "1436/6362"} +{"lm loss": 5.10665751, "grad_norm": 0.9444232, "learning_rate": 9.203e-05, "elapsed_time_per_iteration": 6.54355955, "memory(GiB)": 21.51, "elapsed_time": "2h 37m 25s", "remaining_time": "8h 59m 33s", "loss_scale": 1.0, "consumed_samples": 367872, "global_step/max_steps": "1437/6362"} +{"lm loss": 5.0935874, "grad_norm": 1.0288471, "learning_rate": 9.201e-05, "elapsed_time_per_iteration": 6.56642437, "memory(GiB)": 21.51, "elapsed_time": "2h 37m 32s", "remaining_time": "8h 59m 26s", "loss_scale": 1.0, "consumed_samples": 368128, "global_step/max_steps": "1438/6362"} +{"lm loss": 5.10684967, "grad_norm": 0.99177903, "learning_rate": 9.2e-05, "elapsed_time_per_iteration": 6.50999117, "memory(GiB)": 21.51, "elapsed_time": "2h 37m 38s", "remaining_time": "8h 59m 19s", "loss_scale": 1.0, "consumed_samples": 368384, "global_step/max_steps": "1439/6362"} +{"lm loss": 5.09375668, "grad_norm": 0.83125758, "learning_rate": 9.198e-05, "elapsed_time_per_iteration": 6.53790998, "memory(GiB)": 21.51, "elapsed_time": "2h 37m 45s", "remaining_time": "8h 59m 12s", "loss_scale": 1.0, "consumed_samples": 368640, "global_step/max_steps": "1440/6362"} +{"lm loss": 5.09758091, "grad_norm": 0.69798195, "learning_rate": 9.197e-05, "elapsed_time_per_iteration": 6.62743282, "memory(GiB)": 21.51, "elapsed_time": "2h 37m 51s", "remaining_time": "8h 59m 6s", "loss_scale": 1.0, "consumed_samples": 368896, "global_step/max_steps": "1441/6362"} +{"lm loss": 5.10724354, "grad_norm": 0.79698926, "learning_rate": 9.196e-05, "elapsed_time_per_iteration": 6.44962335, "memory(GiB)": 21.51, "elapsed_time": "2h 37m 58s", "remaining_time": "8h 58m 59s", "loss_scale": 1.0, "consumed_samples": 369152, "global_step/max_steps": "1442/6362"} +{"lm loss": 5.11377716, "grad_norm": 0.76706648, "learning_rate": 9.194e-05, "elapsed_time_per_iteration": 6.82162642, "memory(GiB)": 21.51, "elapsed_time": "2h 38m 5s", "remaining_time": "8h 58m 53s", "loss_scale": 1.0, "consumed_samples": 369408, "global_step/max_steps": "1443/6362"} +{"lm loss": 5.10533762, "grad_norm": 0.766132, "learning_rate": 9.193e-05, "elapsed_time_per_iteration": 6.69148397, "memory(GiB)": 21.51, "elapsed_time": "2h 38m 11s", "remaining_time": "8h 58m 47s", "loss_scale": 1.0, "consumed_samples": 369664, "global_step/max_steps": "1444/6362"} +{"lm loss": 5.08355522, "grad_norm": 0.63429958, "learning_rate": 9.191e-05, "elapsed_time_per_iteration": 6.47653651, "memory(GiB)": 21.51, "elapsed_time": "2h 38m 18s", "remaining_time": "8h 58m 40s", "loss_scale": 1.0, "consumed_samples": 369920, "global_step/max_steps": "1445/6362"} +{"lm loss": 5.08396912, "grad_norm": 0.59976363, "learning_rate": 9.19e-05, "elapsed_time_per_iteration": 6.52624559, "memory(GiB)": 21.51, "elapsed_time": "2h 38m 24s", "remaining_time": "8h 58m 34s", "loss_scale": 1.0, "consumed_samples": 370176, "global_step/max_steps": "1446/6362"} +{"lm loss": 5.09241009, "grad_norm": 0.70999998, "learning_rate": 9.189e-05, "elapsed_time_per_iteration": 6.64007187, "memory(GiB)": 21.51, "elapsed_time": "2h 38m 31s", "remaining_time": "8h 58m 27s", "loss_scale": 1.0, "consumed_samples": 370432, "global_step/max_steps": "1447/6362"} +{"lm loss": 5.10449171, "grad_norm": 0.82108206, "learning_rate": 9.187e-05, "elapsed_time_per_iteration": 6.50152707, "memory(GiB)": 21.51, "elapsed_time": "2h 38m 38s", "remaining_time": "8h 58m 20s", "loss_scale": 1.0, "consumed_samples": 370688, "global_step/max_steps": "1448/6362"} +{"lm loss": 5.09143209, "grad_norm": 0.94536865, "learning_rate": 9.186e-05, "elapsed_time_per_iteration": 6.61009836, "memory(GiB)": 21.51, "elapsed_time": "2h 38m 44s", "remaining_time": "8h 58m 14s", "loss_scale": 1.0, "consumed_samples": 370944, "global_step/max_steps": "1449/6362"} +{"lm loss": 5.10688782, "grad_norm": 1.05160928, "learning_rate": 9.184e-05, "elapsed_time_per_iteration": 6.78647351, "memory(GiB)": 21.51, "elapsed_time": "2h 38m 51s", "remaining_time": "8h 58m 8s", "loss_scale": 1.0, "consumed_samples": 371200, "global_step/max_steps": "1450/6362"} +{"lm loss": 5.09887218, "grad_norm": 1.00433207, "learning_rate": 9.183e-05, "elapsed_time_per_iteration": 6.48591995, "memory(GiB)": 21.51, "elapsed_time": "2h 38m 57s", "remaining_time": "8h 58m 1s", "loss_scale": 1.0, "consumed_samples": 371456, "global_step/max_steps": "1451/6362"} +{"lm loss": 5.08797026, "grad_norm": 0.99239928, "learning_rate": 9.182e-05, "elapsed_time_per_iteration": 6.63595867, "memory(GiB)": 21.51, "elapsed_time": "2h 39m 4s", "remaining_time": "8h 57m 55s", "loss_scale": 1.0, "consumed_samples": 371712, "global_step/max_steps": "1452/6362"} +{"lm loss": 5.07280827, "grad_norm": 0.91520715, "learning_rate": 9.18e-05, "elapsed_time_per_iteration": 6.65839434, "memory(GiB)": 21.51, "elapsed_time": "2h 39m 11s", "remaining_time": "8h 57m 49s", "loss_scale": 1.0, "consumed_samples": 371968, "global_step/max_steps": "1453/6362"} +{"lm loss": 5.09209538, "grad_norm": 0.79005659, "learning_rate": 9.179e-05, "elapsed_time_per_iteration": 6.54908013, "memory(GiB)": 21.51, "elapsed_time": "2h 39m 17s", "remaining_time": "8h 57m 42s", "loss_scale": 1.0, "consumed_samples": 372224, "global_step/max_steps": "1454/6362"} +{"lm loss": 5.10811424, "grad_norm": 0.77726632, "learning_rate": 9.177e-05, "elapsed_time_per_iteration": 6.55659556, "memory(GiB)": 21.51, "elapsed_time": "2h 39m 24s", "remaining_time": "8h 57m 35s", "loss_scale": 1.0, "consumed_samples": 372480, "global_step/max_steps": "1455/6362"} +{"lm loss": 5.09038639, "grad_norm": 0.862589, "learning_rate": 9.176e-05, "elapsed_time_per_iteration": 6.52341652, "memory(GiB)": 21.51, "elapsed_time": "2h 39m 30s", "remaining_time": "8h 57m 29s", "loss_scale": 1.0, "consumed_samples": 372736, "global_step/max_steps": "1456/6362"} +{"lm loss": 5.11225319, "grad_norm": 0.84218329, "learning_rate": 9.175e-05, "elapsed_time_per_iteration": 6.49173737, "memory(GiB)": 21.51, "elapsed_time": "2h 39m 37s", "remaining_time": "8h 57m 22s", "loss_scale": 1.0, "consumed_samples": 372992, "global_step/max_steps": "1457/6362"} +{"lm loss": 5.10195637, "grad_norm": 0.88412684, "learning_rate": 9.173e-05, "elapsed_time_per_iteration": 6.42476249, "memory(GiB)": 21.51, "elapsed_time": "2h 39m 43s", "remaining_time": "8h 57m 15s", "loss_scale": 1.0, "consumed_samples": 373248, "global_step/max_steps": "1458/6362"} +{"lm loss": 5.08605289, "grad_norm": 0.99792522, "learning_rate": 9.172e-05, "elapsed_time_per_iteration": 6.48910284, "memory(GiB)": 21.51, "elapsed_time": "2h 39m 50s", "remaining_time": "8h 57m 8s", "loss_scale": 1.0, "consumed_samples": 373504, "global_step/max_steps": "1459/6362"} +{"lm loss": 5.11468601, "grad_norm": 0.77517194, "learning_rate": 9.17e-05, "elapsed_time_per_iteration": 6.37432981, "memory(GiB)": 21.51, "elapsed_time": "2h 39m 56s", "remaining_time": "8h 57m 1s", "loss_scale": 1.0, "consumed_samples": 373760, "global_step/max_steps": "1460/6362"} +{"lm loss": 5.10338497, "grad_norm": 0.60996133, "learning_rate": 9.169e-05, "elapsed_time_per_iteration": 6.29918933, "memory(GiB)": 21.51, "elapsed_time": "2h 40m 2s", "remaining_time": "8h 56m 53s", "loss_scale": 1.0, "consumed_samples": 374016, "global_step/max_steps": "1461/6362"} +{"lm loss": 5.09481955, "grad_norm": 0.73878831, "learning_rate": 9.168e-05, "elapsed_time_per_iteration": 6.58241343, "memory(GiB)": 21.51, "elapsed_time": "2h 40m 9s", "remaining_time": "8h 56m 47s", "loss_scale": 1.0, "consumed_samples": 374272, "global_step/max_steps": "1462/6362"} +{"lm loss": 5.10293388, "grad_norm": 0.7531414, "learning_rate": 9.166e-05, "elapsed_time_per_iteration": 6.7668457, "memory(GiB)": 21.51, "elapsed_time": "2h 40m 16s", "remaining_time": "8h 56m 41s", "loss_scale": 1.0, "consumed_samples": 374528, "global_step/max_steps": "1463/6362"} +{"lm loss": 5.11917305, "grad_norm": 0.72942841, "learning_rate": 9.165e-05, "elapsed_time_per_iteration": 6.56698442, "memory(GiB)": 21.51, "elapsed_time": "2h 40m 22s", "remaining_time": "8h 56m 34s", "loss_scale": 1.0, "consumed_samples": 374784, "global_step/max_steps": "1464/6362"} +{"lm loss": 5.08250999, "grad_norm": 0.75153989, "learning_rate": 9.163e-05, "elapsed_time_per_iteration": 6.48449302, "memory(GiB)": 21.51, "elapsed_time": "2h 40m 29s", "remaining_time": "8h 56m 27s", "loss_scale": 1.0, "consumed_samples": 375040, "global_step/max_steps": "1465/6362"} +{"lm loss": 5.08343124, "grad_norm": 0.80233687, "learning_rate": 9.162e-05, "elapsed_time_per_iteration": 6.72456026, "memory(GiB)": 21.51, "elapsed_time": "2h 40m 36s", "remaining_time": "8h 56m 21s", "loss_scale": 1.0, "consumed_samples": 375296, "global_step/max_steps": "1466/6362"} +{"lm loss": 5.06405783, "grad_norm": 0.87789935, "learning_rate": 9.161e-05, "elapsed_time_per_iteration": 6.69761682, "memory(GiB)": 21.51, "elapsed_time": "2h 40m 42s", "remaining_time": "8h 56m 15s", "loss_scale": 1.0, "consumed_samples": 375552, "global_step/max_steps": "1467/6362"} +{"lm loss": 5.08729029, "grad_norm": 0.93764472, "learning_rate": 9.159e-05, "elapsed_time_per_iteration": 6.51446867, "memory(GiB)": 21.51, "elapsed_time": "2h 40m 49s", "remaining_time": "8h 56m 8s", "loss_scale": 1.0, "consumed_samples": 375808, "global_step/max_steps": "1468/6362"} +{"lm loss": 5.11056995, "grad_norm": 0.98646629, "learning_rate": 9.158e-05, "elapsed_time_per_iteration": 6.46861696, "memory(GiB)": 21.51, "elapsed_time": "2h 40m 55s", "remaining_time": "8h 56m 1s", "loss_scale": 1.0, "consumed_samples": 376064, "global_step/max_steps": "1469/6362"} +{"lm loss": 5.10161591, "grad_norm": 0.91271281, "learning_rate": 9.156e-05, "elapsed_time_per_iteration": 6.61705422, "memory(GiB)": 21.51, "elapsed_time": "2h 41m 2s", "remaining_time": "8h 55m 55s", "loss_scale": 1.0, "consumed_samples": 376320, "global_step/max_steps": "1470/6362"} +{"lm loss": 5.07628393, "grad_norm": 0.87659514, "learning_rate": 9.155e-05, "elapsed_time_per_iteration": 6.52350926, "memory(GiB)": 21.51, "elapsed_time": "2h 41m 8s", "remaining_time": "8h 55m 48s", "loss_scale": 1.0, "consumed_samples": 376576, "global_step/max_steps": "1471/6362"} +{"lm loss": 5.08677435, "grad_norm": 0.99515843, "learning_rate": 9.153e-05, "elapsed_time_per_iteration": 6.65477991, "memory(GiB)": 21.51, "elapsed_time": "2h 41m 15s", "remaining_time": "8h 55m 42s", "loss_scale": 1.0, "consumed_samples": 376832, "global_step/max_steps": "1472/6362"} +{"lm loss": 5.12070227, "grad_norm": 1.12329245, "learning_rate": 9.152e-05, "elapsed_time_per_iteration": 6.69044614, "memory(GiB)": 21.51, "elapsed_time": "2h 41m 22s", "remaining_time": "8h 55m 36s", "loss_scale": 1.0, "consumed_samples": 377088, "global_step/max_steps": "1473/6362"} +{"lm loss": 5.10512924, "grad_norm": 0.78522807, "learning_rate": 9.151e-05, "elapsed_time_per_iteration": 6.51542544, "memory(GiB)": 21.51, "elapsed_time": "2h 41m 28s", "remaining_time": "8h 55m 29s", "loss_scale": 1.0, "consumed_samples": 377344, "global_step/max_steps": "1474/6362"} +{"lm loss": 5.08852768, "grad_norm": 0.82221252, "learning_rate": 9.149e-05, "elapsed_time_per_iteration": 6.6715014, "memory(GiB)": 21.51, "elapsed_time": "2h 41m 35s", "remaining_time": "8h 55m 23s", "loss_scale": 1.0, "consumed_samples": 377600, "global_step/max_steps": "1475/6362"} +{"lm loss": 5.10128689, "grad_norm": 1.03770208, "learning_rate": 9.148e-05, "elapsed_time_per_iteration": 6.75665808, "memory(GiB)": 21.51, "elapsed_time": "2h 41m 42s", "remaining_time": "8h 55m 17s", "loss_scale": 1.0, "consumed_samples": 377856, "global_step/max_steps": "1476/6362"} +{"lm loss": 5.07438564, "grad_norm": 0.89763534, "learning_rate": 9.146e-05, "elapsed_time_per_iteration": 6.52827406, "memory(GiB)": 21.51, "elapsed_time": "2h 41m 48s", "remaining_time": "8h 55m 10s", "loss_scale": 1.0, "consumed_samples": 378112, "global_step/max_steps": "1477/6362"} +{"lm loss": 5.09627581, "grad_norm": 0.71166575, "learning_rate": 9.145e-05, "elapsed_time_per_iteration": 6.57392335, "memory(GiB)": 21.51, "elapsed_time": "2h 41m 55s", "remaining_time": "8h 55m 3s", "loss_scale": 1.0, "consumed_samples": 378368, "global_step/max_steps": "1478/6362"} +{"lm loss": 5.08993483, "grad_norm": 0.73955023, "learning_rate": 9.143e-05, "elapsed_time_per_iteration": 6.36458111, "memory(GiB)": 21.51, "elapsed_time": "2h 42m 1s", "remaining_time": "8h 54m 56s", "loss_scale": 1.0, "consumed_samples": 378624, "global_step/max_steps": "1479/6362"} +{"lm loss": 5.10072136, "grad_norm": 0.67096591, "learning_rate": 9.142e-05, "elapsed_time_per_iteration": 6.60516071, "memory(GiB)": 21.51, "elapsed_time": "2h 42m 8s", "remaining_time": "8h 54m 50s", "loss_scale": 1.0, "consumed_samples": 378880, "global_step/max_steps": "1480/6362"} +{"lm loss": 5.08456612, "grad_norm": 0.61519957, "learning_rate": 9.141e-05, "elapsed_time_per_iteration": 6.4204092, "memory(GiB)": 21.51, "elapsed_time": "2h 42m 14s", "remaining_time": "8h 54m 42s", "loss_scale": 1.0, "consumed_samples": 379136, "global_step/max_steps": "1481/6362"} +{"lm loss": 5.10519314, "grad_norm": 0.70971364, "learning_rate": 9.139e-05, "elapsed_time_per_iteration": 6.60373974, "memory(GiB)": 21.51, "elapsed_time": "2h 42m 21s", "remaining_time": "8h 54m 36s", "loss_scale": 1.0, "consumed_samples": 379392, "global_step/max_steps": "1482/6362"} +{"lm loss": 5.10134935, "grad_norm": 0.83653724, "learning_rate": 9.138e-05, "elapsed_time_per_iteration": 6.51608753, "memory(GiB)": 21.51, "elapsed_time": "2h 42m 27s", "remaining_time": "8h 54m 29s", "loss_scale": 1.0, "consumed_samples": 379648, "global_step/max_steps": "1483/6362"} +{"lm loss": 5.07729673, "grad_norm": 0.84105277, "learning_rate": 9.136e-05, "elapsed_time_per_iteration": 6.41512656, "memory(GiB)": 21.51, "elapsed_time": "2h 42m 34s", "remaining_time": "8h 54m 22s", "loss_scale": 1.0, "consumed_samples": 379904, "global_step/max_steps": "1484/6362"} +{"lm loss": 5.0775075, "grad_norm": 0.77463049, "learning_rate": 9.135e-05, "elapsed_time_per_iteration": 6.50500917, "memory(GiB)": 21.51, "elapsed_time": "2h 42m 40s", "remaining_time": "8h 54m 15s", "loss_scale": 1.0, "consumed_samples": 380160, "global_step/max_steps": "1485/6362"} +{"lm loss": 5.11291361, "grad_norm": 0.92267251, "learning_rate": 9.133e-05, "elapsed_time_per_iteration": 6.28414774, "memory(GiB)": 21.51, "elapsed_time": "2h 42m 46s", "remaining_time": "8h 54m 8s", "loss_scale": 1.0, "consumed_samples": 380416, "global_step/max_steps": "1486/6362"} +{"lm loss": 5.10729456, "grad_norm": 1.16497886, "learning_rate": 9.132e-05, "elapsed_time_per_iteration": 6.86925006, "memory(GiB)": 21.51, "elapsed_time": "2h 42m 53s", "remaining_time": "8h 54m 2s", "loss_scale": 1.0, "consumed_samples": 380672, "global_step/max_steps": "1487/6362"} +{"lm loss": 5.0778842, "grad_norm": 0.84497422, "learning_rate": 9.131e-05, "elapsed_time_per_iteration": 6.47468448, "memory(GiB)": 21.51, "elapsed_time": "2h 43m 0s", "remaining_time": "8h 53m 55s", "loss_scale": 1.0, "consumed_samples": 380928, "global_step/max_steps": "1488/6362"} +{"lm loss": 5.09587002, "grad_norm": 0.7077325, "learning_rate": 9.129e-05, "elapsed_time_per_iteration": 6.65108514, "memory(GiB)": 21.51, "elapsed_time": "2h 43m 6s", "remaining_time": "8h 53m 49s", "loss_scale": 1.0, "consumed_samples": 381184, "global_step/max_steps": "1489/6362"} +{"lm loss": 5.08360338, "grad_norm": 0.82868242, "learning_rate": 9.128e-05, "elapsed_time_per_iteration": 6.88303661, "memory(GiB)": 21.51, "elapsed_time": "2h 43m 13s", "remaining_time": "8h 53m 43s", "loss_scale": 1.0, "consumed_samples": 381440, "global_step/max_steps": "1490/6362"} +{"lm loss": 5.05529976, "grad_norm": 0.99473631, "learning_rate": 9.126e-05, "elapsed_time_per_iteration": 6.53069401, "memory(GiB)": 21.51, "elapsed_time": "2h 43m 20s", "remaining_time": "8h 53m 37s", "loss_scale": 1.0, "consumed_samples": 381696, "global_step/max_steps": "1491/6362"} +{"lm loss": 5.09470081, "grad_norm": 0.99431837, "learning_rate": 9.125e-05, "elapsed_time_per_iteration": 6.49102783, "memory(GiB)": 21.51, "elapsed_time": "2h 43m 26s", "remaining_time": "8h 53m 30s", "loss_scale": 1.0, "consumed_samples": 381952, "global_step/max_steps": "1492/6362"} +{"lm loss": 5.09201956, "grad_norm": 0.9452365, "learning_rate": 9.123e-05, "elapsed_time_per_iteration": 6.57441139, "memory(GiB)": 21.51, "elapsed_time": "2h 43m 33s", "remaining_time": "8h 53m 23s", "loss_scale": 1.0, "consumed_samples": 382208, "global_step/max_steps": "1493/6362"} +{"lm loss": 5.08350945, "grad_norm": 1.02424991, "learning_rate": 9.122e-05, "elapsed_time_per_iteration": 6.32129192, "memory(GiB)": 21.51, "elapsed_time": "2h 43m 39s", "remaining_time": "8h 53m 16s", "loss_scale": 1.0, "consumed_samples": 382464, "global_step/max_steps": "1494/6362"} +{"lm loss": 5.08778715, "grad_norm": 1.08150101, "learning_rate": 9.12e-05, "elapsed_time_per_iteration": 6.68907738, "memory(GiB)": 21.51, "elapsed_time": "2h 43m 46s", "remaining_time": "8h 53m 10s", "loss_scale": 1.0, "consumed_samples": 382720, "global_step/max_steps": "1495/6362"} +{"lm loss": 5.09212828, "grad_norm": 0.80999154, "learning_rate": 9.119e-05, "elapsed_time_per_iteration": 6.57361817, "memory(GiB)": 21.51, "elapsed_time": "2h 43m 53s", "remaining_time": "8h 53m 3s", "loss_scale": 1.0, "consumed_samples": 382976, "global_step/max_steps": "1496/6362"} +{"lm loss": 5.09878016, "grad_norm": 0.73554838, "learning_rate": 9.118e-05, "elapsed_time_per_iteration": 6.6271174, "memory(GiB)": 21.51, "elapsed_time": "2h 43m 59s", "remaining_time": "8h 52m 57s", "loss_scale": 1.0, "consumed_samples": 383232, "global_step/max_steps": "1497/6362"} +{"lm loss": 5.0808959, "grad_norm": 0.8514716, "learning_rate": 9.116e-05, "elapsed_time_per_iteration": 6.6161449, "memory(GiB)": 21.51, "elapsed_time": "2h 44m 6s", "remaining_time": "8h 52m 50s", "loss_scale": 1.0, "consumed_samples": 383488, "global_step/max_steps": "1498/6362"} +{"lm loss": 5.06182909, "grad_norm": 0.76287764, "learning_rate": 9.115e-05, "elapsed_time_per_iteration": 6.67692542, "memory(GiB)": 21.51, "elapsed_time": "2h 44m 12s", "remaining_time": "8h 52m 44s", "loss_scale": 1.0, "consumed_samples": 383744, "global_step/max_steps": "1499/6362"} +{"lm loss": 5.07684374, "grad_norm": 0.67923981, "learning_rate": 9.113e-05, "elapsed_time_per_iteration": 6.59046721, "memory(GiB)": 21.51, "elapsed_time": "2h 44m 19s", "remaining_time": "8h 52m 38s", "loss_scale": 1.0, "consumed_samples": 384000, "global_step/max_steps": "1500/6362"} +{"lm loss": 5.10265303, "grad_norm": 0.66153729, "learning_rate": 9.112e-05, "elapsed_time_per_iteration": 6.65319943, "memory(GiB)": 21.51, "elapsed_time": "2h 44m 26s", "remaining_time": "8h 52m 31s", "loss_scale": 1.0, "consumed_samples": 384256, "global_step/max_steps": "1501/6362"} +{"lm loss": 5.10752535, "grad_norm": 0.7588346, "learning_rate": 9.11e-05, "elapsed_time_per_iteration": 6.54167891, "memory(GiB)": 21.51, "elapsed_time": "2h 44m 32s", "remaining_time": "8h 52m 25s", "loss_scale": 1.0, "consumed_samples": 384512, "global_step/max_steps": "1502/6362"} +{"lm loss": 5.09027052, "grad_norm": 0.65378165, "learning_rate": 9.109e-05, "elapsed_time_per_iteration": 6.43516898, "memory(GiB)": 21.51, "elapsed_time": "2h 44m 39s", "remaining_time": "8h 52m 18s", "loss_scale": 1.0, "consumed_samples": 384768, "global_step/max_steps": "1503/6362"} +{"lm loss": 5.06448364, "grad_norm": 0.60813481, "learning_rate": 9.107e-05, "elapsed_time_per_iteration": 6.42264247, "memory(GiB)": 21.51, "elapsed_time": "2h 44m 45s", "remaining_time": "8h 52m 11s", "loss_scale": 1.0, "consumed_samples": 385024, "global_step/max_steps": "1504/6362"} +{"lm loss": 5.06842995, "grad_norm": 0.6954751, "learning_rate": 9.106e-05, "elapsed_time_per_iteration": 6.51574159, "memory(GiB)": 21.51, "elapsed_time": "2h 44m 52s", "remaining_time": "8h 52m 4s", "loss_scale": 1.0, "consumed_samples": 385280, "global_step/max_steps": "1505/6362"} +{"lm loss": 5.07602596, "grad_norm": 0.76056021, "learning_rate": 9.104e-05, "elapsed_time_per_iteration": 6.37661743, "memory(GiB)": 21.51, "elapsed_time": "2h 44m 58s", "remaining_time": "8h 51m 57s", "loss_scale": 1.0, "consumed_samples": 385536, "global_step/max_steps": "1506/6362"} +{"lm loss": 5.07437992, "grad_norm": 0.91695786, "learning_rate": 9.103e-05, "elapsed_time_per_iteration": 6.63035631, "memory(GiB)": 21.51, "elapsed_time": "2h 45m 5s", "remaining_time": "8h 51m 50s", "loss_scale": 1.0, "consumed_samples": 385792, "global_step/max_steps": "1507/6362"} +{"lm loss": 5.09005308, "grad_norm": 1.03931117, "learning_rate": 9.102e-05, "elapsed_time_per_iteration": 6.40973139, "memory(GiB)": 21.51, "elapsed_time": "2h 45m 11s", "remaining_time": "8h 51m 43s", "loss_scale": 1.0, "consumed_samples": 386048, "global_step/max_steps": "1508/6362"} +{"lm loss": 5.08284378, "grad_norm": 0.91357303, "learning_rate": 9.1e-05, "elapsed_time_per_iteration": 6.63342166, "memory(GiB)": 21.51, "elapsed_time": "2h 45m 18s", "remaining_time": "8h 51m 37s", "loss_scale": 1.0, "consumed_samples": 386304, "global_step/max_steps": "1509/6362"} +{"lm loss": 5.09258938, "grad_norm": 0.9000411, "learning_rate": 9.099e-05, "elapsed_time_per_iteration": 6.29706264, "memory(GiB)": 21.51, "elapsed_time": "2h 45m 24s", "remaining_time": "8h 51m 29s", "loss_scale": 1.0, "consumed_samples": 386560, "global_step/max_steps": "1510/6362"} +{"lm loss": 5.09881306, "grad_norm": 1.09012461, "learning_rate": 9.097e-05, "elapsed_time_per_iteration": 6.57084298, "memory(GiB)": 21.51, "elapsed_time": "2h 45m 31s", "remaining_time": "8h 51m 23s", "loss_scale": 1.0, "consumed_samples": 386816, "global_step/max_steps": "1511/6362"} +{"lm loss": 5.09669828, "grad_norm": 0.89790666, "learning_rate": 9.096e-05, "elapsed_time_per_iteration": 6.52837181, "memory(GiB)": 21.51, "elapsed_time": "2h 45m 37s", "remaining_time": "8h 51m 16s", "loss_scale": 1.0, "consumed_samples": 387072, "global_step/max_steps": "1512/6362"} +{"lm loss": 5.07148266, "grad_norm": 0.79575539, "learning_rate": 9.094e-05, "elapsed_time_per_iteration": 6.49604702, "memory(GiB)": 21.51, "elapsed_time": "2h 45m 44s", "remaining_time": "8h 51m 9s", "loss_scale": 1.0, "consumed_samples": 387328, "global_step/max_steps": "1513/6362"} +{"lm loss": 5.06885481, "grad_norm": 0.71405286, "learning_rate": 9.093e-05, "elapsed_time_per_iteration": 6.52911186, "memory(GiB)": 21.51, "elapsed_time": "2h 45m 50s", "remaining_time": "8h 51m 2s", "loss_scale": 1.0, "consumed_samples": 387584, "global_step/max_steps": "1514/6362"} +{"lm loss": 5.08062172, "grad_norm": 0.66046137, "learning_rate": 9.091e-05, "elapsed_time_per_iteration": 6.64871073, "memory(GiB)": 21.51, "elapsed_time": "2h 45m 57s", "remaining_time": "8h 50m 56s", "loss_scale": 1.0, "consumed_samples": 387840, "global_step/max_steps": "1515/6362"} +{"lm loss": 5.08384895, "grad_norm": 0.75514543, "learning_rate": 9.09e-05, "elapsed_time_per_iteration": 6.53876925, "memory(GiB)": 21.51, "elapsed_time": "2h 46m 3s", "remaining_time": "8h 50m 49s", "loss_scale": 1.0, "consumed_samples": 388096, "global_step/max_steps": "1516/6362"} +{"lm loss": 5.07085705, "grad_norm": 0.7949791, "learning_rate": 9.088e-05, "elapsed_time_per_iteration": 6.48193121, "memory(GiB)": 21.51, "elapsed_time": "2h 46m 10s", "remaining_time": "8h 50m 43s", "loss_scale": 1.0, "consumed_samples": 388352, "global_step/max_steps": "1517/6362"} +{"lm loss": 5.07985353, "grad_norm": 0.84277028, "learning_rate": 9.087e-05, "elapsed_time_per_iteration": 6.36138797, "memory(GiB)": 21.51, "elapsed_time": "2h 46m 16s", "remaining_time": "8h 50m 35s", "loss_scale": 1.0, "consumed_samples": 388608, "global_step/max_steps": "1518/6362"} +{"lm loss": 5.06474876, "grad_norm": 0.8395918, "learning_rate": 9.085e-05, "elapsed_time_per_iteration": 6.55986643, "memory(GiB)": 21.51, "elapsed_time": "2h 46m 23s", "remaining_time": "8h 50m 29s", "loss_scale": 1.0, "consumed_samples": 388864, "global_step/max_steps": "1519/6362"} +{"lm loss": 5.09485817, "grad_norm": 0.73417747, "learning_rate": 9.084e-05, "elapsed_time_per_iteration": 6.53743386, "memory(GiB)": 21.51, "elapsed_time": "2h 46m 29s", "remaining_time": "8h 50m 22s", "loss_scale": 1.0, "consumed_samples": 389120, "global_step/max_steps": "1520/6362"} +{"lm loss": 5.09782696, "grad_norm": 0.7400068, "learning_rate": 9.082e-05, "elapsed_time_per_iteration": 6.63241386, "memory(GiB)": 21.51, "elapsed_time": "2h 46m 36s", "remaining_time": "8h 50m 16s", "loss_scale": 1.0, "consumed_samples": 389376, "global_step/max_steps": "1521/6362"} +{"lm loss": 5.10217571, "grad_norm": 0.79708713, "learning_rate": 9.081e-05, "elapsed_time_per_iteration": 6.33386993, "memory(GiB)": 21.51, "elapsed_time": "2h 46m 42s", "remaining_time": "8h 50m 8s", "loss_scale": 1.0, "consumed_samples": 389632, "global_step/max_steps": "1522/6362"} +{"lm loss": 5.07446623, "grad_norm": 0.72508538, "learning_rate": 9.079e-05, "elapsed_time_per_iteration": 6.62458253, "memory(GiB)": 21.51, "elapsed_time": "2h 46m 49s", "remaining_time": "8h 50m 2s", "loss_scale": 1.0, "consumed_samples": 389888, "global_step/max_steps": "1523/6362"} +{"lm loss": 5.07175303, "grad_norm": 0.73855501, "learning_rate": 9.078e-05, "elapsed_time_per_iteration": 6.53088188, "memory(GiB)": 21.51, "elapsed_time": "2h 46m 55s", "remaining_time": "8h 49m 55s", "loss_scale": 1.0, "consumed_samples": 390144, "global_step/max_steps": "1524/6362"} +{"lm loss": 5.06767654, "grad_norm": 0.87868965, "learning_rate": 9.077e-05, "elapsed_time_per_iteration": 6.51174927, "memory(GiB)": 21.51, "elapsed_time": "2h 47m 2s", "remaining_time": "8h 49m 48s", "loss_scale": 1.0, "consumed_samples": 390400, "global_step/max_steps": "1525/6362"} +{"lm loss": 5.08225298, "grad_norm": 0.88738745, "learning_rate": 9.075e-05, "elapsed_time_per_iteration": 6.32260537, "memory(GiB)": 21.51, "elapsed_time": "2h 47m 8s", "remaining_time": "8h 49m 41s", "loss_scale": 1.0, "consumed_samples": 390656, "global_step/max_steps": "1526/6362"} +{"lm loss": 5.07930613, "grad_norm": 0.91149485, "learning_rate": 9.074e-05, "elapsed_time_per_iteration": 6.73439145, "memory(GiB)": 21.51, "elapsed_time": "2h 47m 15s", "remaining_time": "8h 49m 35s", "loss_scale": 1.0, "consumed_samples": 390912, "global_step/max_steps": "1527/6362"} +{"lm loss": 5.09516907, "grad_norm": 0.82116282, "learning_rate": 9.072e-05, "elapsed_time_per_iteration": 6.51331687, "memory(GiB)": 21.51, "elapsed_time": "2h 47m 21s", "remaining_time": "8h 49m 28s", "loss_scale": 1.0, "consumed_samples": 391168, "global_step/max_steps": "1528/6362"} +{"lm loss": 5.07804251, "grad_norm": 0.83793634, "learning_rate": 9.071e-05, "elapsed_time_per_iteration": 6.68705106, "memory(GiB)": 21.51, "elapsed_time": "2h 47m 28s", "remaining_time": "8h 49m 22s", "loss_scale": 1.0, "consumed_samples": 391424, "global_step/max_steps": "1529/6362"} +{"lm loss": 5.10002661, "grad_norm": 1.01822817, "learning_rate": 9.069e-05, "elapsed_time_per_iteration": 6.46519566, "memory(GiB)": 21.51, "elapsed_time": "2h 47m 35s", "remaining_time": "8h 49m 15s", "loss_scale": 1.0, "consumed_samples": 391680, "global_step/max_steps": "1530/6362"} +{"lm loss": 5.07766819, "grad_norm": 1.16561484, "learning_rate": 9.068e-05, "elapsed_time_per_iteration": 6.50221276, "memory(GiB)": 21.51, "elapsed_time": "2h 47m 41s", "remaining_time": "8h 49m 8s", "loss_scale": 1.0, "consumed_samples": 391936, "global_step/max_steps": "1531/6362"} +{"lm loss": 5.07695484, "grad_norm": 0.8938055, "learning_rate": 9.066e-05, "elapsed_time_per_iteration": 6.51445603, "memory(GiB)": 21.51, "elapsed_time": "2h 47m 48s", "remaining_time": "8h 49m 2s", "loss_scale": 1.0, "consumed_samples": 392192, "global_step/max_steps": "1532/6362"} +{"lm loss": 5.08627796, "grad_norm": 0.83946341, "learning_rate": 9.065e-05, "elapsed_time_per_iteration": 6.32339454, "memory(GiB)": 21.51, "elapsed_time": "2h 47m 54s", "remaining_time": "8h 48m 54s", "loss_scale": 1.0, "consumed_samples": 392448, "global_step/max_steps": "1533/6362"} +{"lm loss": 5.08205223, "grad_norm": 0.67295349, "learning_rate": 9.063e-05, "elapsed_time_per_iteration": 6.38681793, "memory(GiB)": 21.51, "elapsed_time": "2h 48m 0s", "remaining_time": "8h 48m 47s", "loss_scale": 1.0, "consumed_samples": 392704, "global_step/max_steps": "1534/6362"} +{"lm loss": 5.11087465, "grad_norm": 0.73468095, "learning_rate": 9.062e-05, "elapsed_time_per_iteration": 6.51975965, "memory(GiB)": 21.51, "elapsed_time": "2h 48m 7s", "remaining_time": "8h 48m 40s", "loss_scale": 1.0, "consumed_samples": 392960, "global_step/max_steps": "1535/6362"} +{"lm loss": 5.09978914, "grad_norm": 0.82107812, "learning_rate": 9.06e-05, "elapsed_time_per_iteration": 6.53318667, "memory(GiB)": 21.51, "elapsed_time": "2h 48m 13s", "remaining_time": "8h 48m 34s", "loss_scale": 1.0, "consumed_samples": 393216, "global_step/max_steps": "1536/6362"} +{"lm loss": 5.07511663, "grad_norm": 0.891047, "learning_rate": 9.059e-05, "elapsed_time_per_iteration": 6.77105498, "memory(GiB)": 21.51, "elapsed_time": "2h 48m 20s", "remaining_time": "8h 48m 28s", "loss_scale": 1.0, "consumed_samples": 393472, "global_step/max_steps": "1537/6362"} +{"lm loss": 5.0771637, "grad_norm": 0.79945385, "learning_rate": 9.057e-05, "elapsed_time_per_iteration": 6.60109234, "memory(GiB)": 21.51, "elapsed_time": "2h 48m 27s", "remaining_time": "8h 48m 21s", "loss_scale": 1.0, "consumed_samples": 393728, "global_step/max_steps": "1538/6362"} +{"lm loss": 5.08253765, "grad_norm": 0.74526596, "learning_rate": 9.056e-05, "elapsed_time_per_iteration": 6.49054956, "memory(GiB)": 21.51, "elapsed_time": "2h 48m 33s", "remaining_time": "8h 48m 14s", "loss_scale": 1.0, "consumed_samples": 393984, "global_step/max_steps": "1539/6362"} +{"lm loss": 5.10102892, "grad_norm": 0.75075901, "learning_rate": 9.054e-05, "elapsed_time_per_iteration": 6.47062063, "memory(GiB)": 21.51, "elapsed_time": "2h 48m 40s", "remaining_time": "8h 48m 8s", "loss_scale": 1.0, "consumed_samples": 394240, "global_step/max_steps": "1540/6362"} +{"lm loss": 5.08276987, "grad_norm": 0.75387877, "learning_rate": 9.053e-05, "elapsed_time_per_iteration": 6.43147588, "memory(GiB)": 21.51, "elapsed_time": "2h 48m 46s", "remaining_time": "8h 48m 1s", "loss_scale": 1.0, "consumed_samples": 394496, "global_step/max_steps": "1541/6362"} +{"lm loss": 5.0829196, "grad_norm": 0.7011627, "learning_rate": 9.051e-05, "elapsed_time_per_iteration": 6.46508121, "memory(GiB)": 21.51, "elapsed_time": "2h 48m 53s", "remaining_time": "8h 47m 54s", "loss_scale": 1.0, "consumed_samples": 394752, "global_step/max_steps": "1542/6362"} +{"lm loss": 5.07000351, "grad_norm": 0.70785296, "learning_rate": 9.05e-05, "elapsed_time_per_iteration": 6.65261841, "memory(GiB)": 21.51, "elapsed_time": "2h 48m 59s", "remaining_time": "8h 47m 47s", "loss_scale": 1.0, "consumed_samples": 395008, "global_step/max_steps": "1543/6362"} +{"lm loss": 5.07154369, "grad_norm": 0.72088891, "learning_rate": 9.048e-05, "elapsed_time_per_iteration": 6.66472745, "memory(GiB)": 21.51, "elapsed_time": "2h 49m 6s", "remaining_time": "8h 47m 41s", "loss_scale": 1.0, "consumed_samples": 395264, "global_step/max_steps": "1544/6362"} +{"lm loss": 5.07147789, "grad_norm": 0.66970086, "learning_rate": 9.047e-05, "elapsed_time_per_iteration": 6.60710716, "memory(GiB)": 21.51, "elapsed_time": "2h 49m 13s", "remaining_time": "8h 47m 35s", "loss_scale": 1.0, "consumed_samples": 395520, "global_step/max_steps": "1545/6362"} +{"lm loss": 5.09416485, "grad_norm": 0.67283362, "learning_rate": 9.045e-05, "elapsed_time_per_iteration": 6.63926315, "memory(GiB)": 21.51, "elapsed_time": "2h 49m 19s", "remaining_time": "8h 47m 28s", "loss_scale": 1.0, "consumed_samples": 395776, "global_step/max_steps": "1546/6362"} +{"lm loss": 5.06529427, "grad_norm": 0.81676996, "learning_rate": 9.044e-05, "elapsed_time_per_iteration": 6.50849271, "memory(GiB)": 21.51, "elapsed_time": "2h 49m 26s", "remaining_time": "8h 47m 21s", "loss_scale": 1.0, "consumed_samples": 396032, "global_step/max_steps": "1547/6362"} +{"lm loss": 5.07450533, "grad_norm": 0.99339283, "learning_rate": 9.042e-05, "elapsed_time_per_iteration": 6.5872364, "memory(GiB)": 21.51, "elapsed_time": "2h 49m 32s", "remaining_time": "8h 47m 15s", "loss_scale": 1.0, "consumed_samples": 396288, "global_step/max_steps": "1548/6362"} +{"lm loss": 5.09517765, "grad_norm": 1.09593546, "learning_rate": 9.041e-05, "elapsed_time_per_iteration": 6.4784286, "memory(GiB)": 21.51, "elapsed_time": "2h 49m 39s", "remaining_time": "8h 47m 8s", "loss_scale": 1.0, "consumed_samples": 396544, "global_step/max_steps": "1549/6362"} +{"lm loss": 5.08320475, "grad_norm": 1.0662843, "learning_rate": 9.039e-05, "elapsed_time_per_iteration": 6.55407691, "memory(GiB)": 21.51, "elapsed_time": "2h 49m 45s", "remaining_time": "8h 47m 1s", "loss_scale": 1.0, "consumed_samples": 396800, "global_step/max_steps": "1550/6362"} +{"lm loss": 5.06433058, "grad_norm": 1.08736503, "learning_rate": 9.038e-05, "elapsed_time_per_iteration": 6.52195334, "memory(GiB)": 21.51, "elapsed_time": "2h 49m 52s", "remaining_time": "8h 46m 55s", "loss_scale": 1.0, "consumed_samples": 397056, "global_step/max_steps": "1551/6362"} +{"lm loss": 5.06886768, "grad_norm": 0.83689368, "learning_rate": 9.036e-05, "elapsed_time_per_iteration": 6.4964273, "memory(GiB)": 21.51, "elapsed_time": "2h 49m 58s", "remaining_time": "8h 46m 48s", "loss_scale": 1.0, "consumed_samples": 397312, "global_step/max_steps": "1552/6362"} +{"lm loss": 5.08844423, "grad_norm": 0.79877603, "learning_rate": 9.035e-05, "elapsed_time_per_iteration": 6.5789268, "memory(GiB)": 21.51, "elapsed_time": "2h 50m 5s", "remaining_time": "8h 46m 41s", "loss_scale": 1.0, "consumed_samples": 397568, "global_step/max_steps": "1553/6362"} +{"lm loss": 5.08633852, "grad_norm": 0.877505, "learning_rate": 9.033e-05, "elapsed_time_per_iteration": 6.75355935, "memory(GiB)": 21.51, "elapsed_time": "2h 50m 12s", "remaining_time": "8h 46m 35s", "loss_scale": 1.0, "consumed_samples": 397824, "global_step/max_steps": "1554/6362"} +{"lm loss": 5.0771451, "grad_norm": 0.87873578, "learning_rate": 9.032e-05, "elapsed_time_per_iteration": 6.58719087, "memory(GiB)": 21.51, "elapsed_time": "2h 50m 18s", "remaining_time": "8h 46m 29s", "loss_scale": 1.0, "consumed_samples": 398080, "global_step/max_steps": "1555/6362"} +{"lm loss": 5.08489275, "grad_norm": 0.7608552, "learning_rate": 9.03e-05, "elapsed_time_per_iteration": 6.68453836, "memory(GiB)": 21.51, "elapsed_time": "2h 50m 25s", "remaining_time": "8h 46m 23s", "loss_scale": 1.0, "consumed_samples": 398336, "global_step/max_steps": "1556/6362"} +{"lm loss": 5.09669304, "grad_norm": 0.71932256, "learning_rate": 9.029e-05, "elapsed_time_per_iteration": 6.51092768, "memory(GiB)": 21.51, "elapsed_time": "2h 50m 31s", "remaining_time": "8h 46m 16s", "loss_scale": 1.0, "consumed_samples": 398592, "global_step/max_steps": "1557/6362"} +{"lm loss": 5.10271597, "grad_norm": 0.73132652, "learning_rate": 9.027e-05, "elapsed_time_per_iteration": 6.56578374, "memory(GiB)": 21.51, "elapsed_time": "2h 50m 38s", "remaining_time": "8h 46m 9s", "loss_scale": 1.0, "consumed_samples": 398848, "global_step/max_steps": "1558/6362"} +{"lm loss": 5.08840704, "grad_norm": 0.62190974, "learning_rate": 9.026e-05, "elapsed_time_per_iteration": 6.64903855, "memory(GiB)": 21.51, "elapsed_time": "2h 50m 45s", "remaining_time": "8h 46m 3s", "loss_scale": 1.0, "consumed_samples": 399104, "global_step/max_steps": "1559/6362"} +{"lm loss": 5.07788849, "grad_norm": 0.66787213, "learning_rate": 9.024e-05, "elapsed_time_per_iteration": 6.42225623, "memory(GiB)": 21.51, "elapsed_time": "2h 50m 51s", "remaining_time": "8h 45m 56s", "loss_scale": 1.0, "consumed_samples": 399360, "global_step/max_steps": "1560/6362"} +{"lm loss": 5.05982113, "grad_norm": 0.78677016, "learning_rate": 9.023e-05, "elapsed_time_per_iteration": 6.72380781, "memory(GiB)": 21.51, "elapsed_time": "2h 50m 58s", "remaining_time": "8h 45m 50s", "loss_scale": 1.0, "consumed_samples": 399616, "global_step/max_steps": "1561/6362"} +{"lm loss": 5.06241179, "grad_norm": 0.88948494, "learning_rate": 9.021e-05, "elapsed_time_per_iteration": 6.4051981, "memory(GiB)": 21.51, "elapsed_time": "2h 51m 4s", "remaining_time": "8h 45m 43s", "loss_scale": 1.0, "consumed_samples": 399872, "global_step/max_steps": "1562/6362"} +{"lm loss": 5.08020973, "grad_norm": 0.82002795, "learning_rate": 9.02e-05, "elapsed_time_per_iteration": 6.4914453, "memory(GiB)": 21.51, "elapsed_time": "2h 51m 11s", "remaining_time": "8h 45m 36s", "loss_scale": 1.0, "consumed_samples": 400128, "global_step/max_steps": "1563/6362"} +{"lm loss": 5.09430313, "grad_norm": 0.71428812, "learning_rate": 9.018e-05, "elapsed_time_per_iteration": 6.50136518, "memory(GiB)": 21.51, "elapsed_time": "2h 51m 17s", "remaining_time": "8h 45m 29s", "loss_scale": 1.0, "consumed_samples": 400384, "global_step/max_steps": "1564/6362"} +{"lm loss": 5.10410166, "grad_norm": 0.77963078, "learning_rate": 9.016e-05, "elapsed_time_per_iteration": 6.57771254, "memory(GiB)": 21.51, "elapsed_time": "2h 51m 24s", "remaining_time": "8h 45m 23s", "loss_scale": 1.0, "consumed_samples": 400640, "global_step/max_steps": "1565/6362"} +{"lm loss": 5.09504032, "grad_norm": 0.89322937, "learning_rate": 9.015e-05, "elapsed_time_per_iteration": 6.34942555, "memory(GiB)": 21.51, "elapsed_time": "2h 51m 30s", "remaining_time": "8h 45m 15s", "loss_scale": 1.0, "consumed_samples": 400896, "global_step/max_steps": "1566/6362"} +{"lm loss": 5.09121084, "grad_norm": 1.07074523, "learning_rate": 9.013e-05, "elapsed_time_per_iteration": 6.49477768, "memory(GiB)": 21.51, "elapsed_time": "2h 51m 37s", "remaining_time": "8h 45m 8s", "loss_scale": 1.0, "consumed_samples": 401152, "global_step/max_steps": "1567/6362"} +{"lm loss": 5.07058573, "grad_norm": 0.87741411, "learning_rate": 9.012e-05, "elapsed_time_per_iteration": 6.60052586, "memory(GiB)": 21.51, "elapsed_time": "2h 51m 43s", "remaining_time": "8h 45m 2s", "loss_scale": 1.0, "consumed_samples": 401408, "global_step/max_steps": "1568/6362"} +{"lm loss": 5.08019972, "grad_norm": 0.66513735, "learning_rate": 9.01e-05, "elapsed_time_per_iteration": 6.69353294, "memory(GiB)": 21.51, "elapsed_time": "2h 51m 50s", "remaining_time": "8h 44m 56s", "loss_scale": 1.0, "consumed_samples": 401664, "global_step/max_steps": "1569/6362"} +{"lm loss": 5.07354689, "grad_norm": 0.78559929, "learning_rate": 9.009e-05, "elapsed_time_per_iteration": 6.50109243, "memory(GiB)": 21.51, "elapsed_time": "2h 51m 56s", "remaining_time": "8h 44m 49s", "loss_scale": 1.0, "consumed_samples": 401920, "global_step/max_steps": "1570/6362"} +{"lm loss": 5.05769968, "grad_norm": 0.80084234, "learning_rate": 9.007e-05, "elapsed_time_per_iteration": 6.56855345, "memory(GiB)": 21.51, "elapsed_time": "2h 52m 3s", "remaining_time": "8h 44m 42s", "loss_scale": 1.0, "consumed_samples": 402176, "global_step/max_steps": "1571/6362"} +{"lm loss": 5.06804037, "grad_norm": 0.974756, "learning_rate": 9.006e-05, "elapsed_time_per_iteration": 6.66365409, "memory(GiB)": 21.51, "elapsed_time": "2h 52m 10s", "remaining_time": "8h 44m 36s", "loss_scale": 1.0, "consumed_samples": 402432, "global_step/max_steps": "1572/6362"} +{"lm loss": 5.08853292, "grad_norm": 1.1117053, "learning_rate": 9.004e-05, "elapsed_time_per_iteration": 6.58171463, "memory(GiB)": 21.51, "elapsed_time": "2h 52m 16s", "remaining_time": "8h 44m 30s", "loss_scale": 1.0, "consumed_samples": 402688, "global_step/max_steps": "1573/6362"} +{"lm loss": 5.07495499, "grad_norm": 0.88993728, "learning_rate": 9.003e-05, "elapsed_time_per_iteration": 6.48729396, "memory(GiB)": 21.51, "elapsed_time": "2h 52m 23s", "remaining_time": "8h 44m 23s", "loss_scale": 1.0, "consumed_samples": 402944, "global_step/max_steps": "1574/6362"} +{"lm loss": 5.07794285, "grad_norm": 0.90041625, "learning_rate": 9.001e-05, "elapsed_time_per_iteration": 6.52695465, "memory(GiB)": 21.51, "elapsed_time": "2h 52m 29s", "remaining_time": "8h 44m 16s", "loss_scale": 1.0, "consumed_samples": 403200, "global_step/max_steps": "1575/6362"} +{"lm loss": 5.1026125, "grad_norm": 0.86674368, "learning_rate": 9e-05, "elapsed_time_per_iteration": 6.57369113, "memory(GiB)": 21.51, "elapsed_time": "2h 52m 36s", "remaining_time": "8h 44m 10s", "loss_scale": 1.0, "consumed_samples": 403456, "global_step/max_steps": "1576/6362"} +{"lm loss": 5.07986975, "grad_norm": 0.85299069, "learning_rate": 8.998e-05, "elapsed_time_per_iteration": 6.52078152, "memory(GiB)": 21.51, "elapsed_time": "2h 52m 42s", "remaining_time": "8h 44m 3s", "loss_scale": 1.0, "consumed_samples": 403712, "global_step/max_steps": "1577/6362"} +{"lm loss": 5.09738016, "grad_norm": 0.84510201, "learning_rate": 8.997e-05, "elapsed_time_per_iteration": 6.64834714, "memory(GiB)": 21.51, "elapsed_time": "2h 52m 49s", "remaining_time": "8h 43m 56s", "loss_scale": 1.0, "consumed_samples": 403968, "global_step/max_steps": "1578/6362"} +{"lm loss": 5.08457661, "grad_norm": 0.93467277, "learning_rate": 8.995e-05, "elapsed_time_per_iteration": 6.48103237, "memory(GiB)": 21.51, "elapsed_time": "2h 52m 55s", "remaining_time": "8h 43m 50s", "loss_scale": 1.0, "consumed_samples": 404224, "global_step/max_steps": "1579/6362"} +{"lm loss": 5.07577896, "grad_norm": 0.90808165, "learning_rate": 8.994e-05, "elapsed_time_per_iteration": 6.57405257, "memory(GiB)": 21.51, "elapsed_time": "2h 53m 2s", "remaining_time": "8h 43m 43s", "loss_scale": 1.0, "consumed_samples": 404480, "global_step/max_steps": "1580/6362"} +{"lm loss": 5.09114075, "grad_norm": 0.8797031, "learning_rate": 8.992e-05, "elapsed_time_per_iteration": 6.45874333, "memory(GiB)": 21.51, "elapsed_time": "2h 53m 8s", "remaining_time": "8h 43m 36s", "loss_scale": 1.0, "consumed_samples": 404736, "global_step/max_steps": "1581/6362"} +{"lm loss": 5.06500959, "grad_norm": 0.75034237, "learning_rate": 8.99e-05, "elapsed_time_per_iteration": 6.60465741, "memory(GiB)": 21.51, "elapsed_time": "2h 53m 15s", "remaining_time": "8h 43m 30s", "loss_scale": 1.0, "consumed_samples": 404992, "global_step/max_steps": "1582/6362"} +{"lm loss": 5.07307196, "grad_norm": 0.70713729, "learning_rate": 8.989e-05, "elapsed_time_per_iteration": 6.59012771, "memory(GiB)": 21.51, "elapsed_time": "2h 53m 22s", "remaining_time": "8h 43m 23s", "loss_scale": 1.0, "consumed_samples": 405248, "global_step/max_steps": "1583/6362"} +{"lm loss": 5.09095621, "grad_norm": 0.85014617, "learning_rate": 8.987e-05, "elapsed_time_per_iteration": 6.66933441, "memory(GiB)": 21.51, "elapsed_time": "2h 53m 28s", "remaining_time": "8h 43m 17s", "loss_scale": 1.0, "consumed_samples": 405504, "global_step/max_steps": "1584/6362"} +{"lm loss": 5.07863188, "grad_norm": 0.80865943, "learning_rate": 8.986e-05, "elapsed_time_per_iteration": 6.48525381, "memory(GiB)": 21.51, "elapsed_time": "2h 53m 35s", "remaining_time": "8h 43m 10s", "loss_scale": 1.0, "consumed_samples": 405760, "global_step/max_steps": "1585/6362"} +{"lm loss": 5.084167, "grad_norm": 0.66674221, "learning_rate": 8.984e-05, "elapsed_time_per_iteration": 6.54303002, "memory(GiB)": 21.51, "elapsed_time": "2h 53m 41s", "remaining_time": "8h 43m 3s", "loss_scale": 1.0, "consumed_samples": 406016, "global_step/max_steps": "1586/6362"} +{"lm loss": 5.06007195, "grad_norm": 0.6760326, "learning_rate": 8.983e-05, "elapsed_time_per_iteration": 6.51464224, "memory(GiB)": 21.51, "elapsed_time": "2h 53m 48s", "remaining_time": "8h 42m 57s", "loss_scale": 1.0, "consumed_samples": 406272, "global_step/max_steps": "1587/6362"} +{"lm loss": 5.07351971, "grad_norm": 0.75153363, "learning_rate": 8.981e-05, "elapsed_time_per_iteration": 6.71028233, "memory(GiB)": 21.51, "elapsed_time": "2h 53m 55s", "remaining_time": "8h 42m 50s", "loss_scale": 1.0, "consumed_samples": 406528, "global_step/max_steps": "1588/6362"} +{"lm loss": 5.06326628, "grad_norm": 0.58038795, "learning_rate": 8.98e-05, "elapsed_time_per_iteration": 6.42330885, "memory(GiB)": 21.51, "elapsed_time": "2h 54m 1s", "remaining_time": "8h 42m 43s", "loss_scale": 1.0, "consumed_samples": 406784, "global_step/max_steps": "1589/6362"} +{"lm loss": 5.06696844, "grad_norm": 0.74047679, "learning_rate": 8.978e-05, "elapsed_time_per_iteration": 6.72426558, "memory(GiB)": 21.51, "elapsed_time": "2h 54m 8s", "remaining_time": "8h 42m 37s", "loss_scale": 1.0, "consumed_samples": 407040, "global_step/max_steps": "1590/6362"} +{"lm loss": 5.06147909, "grad_norm": 0.72817039, "learning_rate": 8.977e-05, "elapsed_time_per_iteration": 6.59742451, "memory(GiB)": 21.51, "elapsed_time": "2h 54m 14s", "remaining_time": "8h 42m 31s", "loss_scale": 1.0, "consumed_samples": 407296, "global_step/max_steps": "1591/6362"} +{"lm loss": 5.05465221, "grad_norm": 0.75015867, "learning_rate": 8.975e-05, "elapsed_time_per_iteration": 6.60383773, "memory(GiB)": 21.51, "elapsed_time": "2h 54m 21s", "remaining_time": "8h 42m 24s", "loss_scale": 1.0, "consumed_samples": 407552, "global_step/max_steps": "1592/6362"} +{"lm loss": 5.06382942, "grad_norm": 0.74838603, "learning_rate": 8.973e-05, "elapsed_time_per_iteration": 6.43522906, "memory(GiB)": 21.51, "elapsed_time": "2h 54m 27s", "remaining_time": "8h 42m 17s", "loss_scale": 1.0, "consumed_samples": 407808, "global_step/max_steps": "1593/6362"} +{"lm loss": 5.07706547, "grad_norm": 0.87198514, "learning_rate": 8.972e-05, "elapsed_time_per_iteration": 6.39239907, "memory(GiB)": 21.51, "elapsed_time": "2h 54m 34s", "remaining_time": "8h 42m 10s", "loss_scale": 1.0, "consumed_samples": 408064, "global_step/max_steps": "1594/6362"} +{"lm loss": 5.06457567, "grad_norm": 0.83326072, "learning_rate": 8.97e-05, "elapsed_time_per_iteration": 6.57911754, "memory(GiB)": 21.51, "elapsed_time": "2h 54m 40s", "remaining_time": "8h 42m 4s", "loss_scale": 1.0, "consumed_samples": 408320, "global_step/max_steps": "1595/6362"} +{"lm loss": 5.09533978, "grad_norm": 0.79242134, "learning_rate": 8.969e-05, "elapsed_time_per_iteration": 6.42245889, "memory(GiB)": 21.51, "elapsed_time": "2h 54m 47s", "remaining_time": "8h 41m 57s", "loss_scale": 1.0, "consumed_samples": 408576, "global_step/max_steps": "1596/6362"} +{"lm loss": 5.06723213, "grad_norm": 0.72792476, "learning_rate": 8.967e-05, "elapsed_time_per_iteration": 6.66757011, "memory(GiB)": 21.51, "elapsed_time": "2h 54m 53s", "remaining_time": "8h 41m 50s", "loss_scale": 1.0, "consumed_samples": 408832, "global_step/max_steps": "1597/6362"} +{"lm loss": 5.06427145, "grad_norm": 0.61371815, "learning_rate": 8.966e-05, "elapsed_time_per_iteration": 6.51001048, "memory(GiB)": 21.51, "elapsed_time": "2h 55m 0s", "remaining_time": "8h 41m 44s", "loss_scale": 1.0, "consumed_samples": 409088, "global_step/max_steps": "1598/6362"} +{"lm loss": 5.0740099, "grad_norm": 0.64523709, "learning_rate": 8.964e-05, "elapsed_time_per_iteration": 6.52438068, "memory(GiB)": 21.51, "elapsed_time": "2h 55m 6s", "remaining_time": "8h 41m 37s", "loss_scale": 1.0, "consumed_samples": 409344, "global_step/max_steps": "1599/6362"} +{"lm loss": 5.05252409, "grad_norm": 0.66990775, "learning_rate": 8.963e-05, "elapsed_time_per_iteration": 6.71858454, "memory(GiB)": 21.51, "elapsed_time": "2h 55m 13s", "remaining_time": "8h 41m 31s", "loss_scale": 1.0, "consumed_samples": 409600, "global_step/max_steps": "1600/6362"} +{"lm loss": 5.06150198, "grad_norm": 0.65261871, "learning_rate": 8.961e-05, "elapsed_time_per_iteration": 6.69528365, "memory(GiB)": 21.51, "elapsed_time": "2h 55m 20s", "remaining_time": "8h 41m 25s", "loss_scale": 1.0, "consumed_samples": 409856, "global_step/max_steps": "1601/6362"} +{"lm loss": 5.06346369, "grad_norm": 0.78300983, "learning_rate": 8.959e-05, "elapsed_time_per_iteration": 6.5502491, "memory(GiB)": 21.51, "elapsed_time": "2h 55m 26s", "remaining_time": "8h 41m 18s", "loss_scale": 1.0, "consumed_samples": 410112, "global_step/max_steps": "1602/6362"} +{"lm loss": 5.06138659, "grad_norm": 0.83315492, "learning_rate": 8.958e-05, "elapsed_time_per_iteration": 6.50294614, "memory(GiB)": 21.51, "elapsed_time": "2h 55m 33s", "remaining_time": "8h 41m 11s", "loss_scale": 1.0, "consumed_samples": 410368, "global_step/max_steps": "1603/6362"} +{"lm loss": 5.07204485, "grad_norm": 0.83058703, "learning_rate": 8.956e-05, "elapsed_time_per_iteration": 6.58300781, "memory(GiB)": 21.51, "elapsed_time": "2h 55m 40s", "remaining_time": "8h 41m 5s", "loss_scale": 1.0, "consumed_samples": 410624, "global_step/max_steps": "1604/6362"} +{"lm loss": 5.06829596, "grad_norm": 0.93958354, "learning_rate": 8.955e-05, "elapsed_time_per_iteration": 6.533427, "memory(GiB)": 21.51, "elapsed_time": "2h 55m 46s", "remaining_time": "8h 40m 58s", "loss_scale": 1.0, "consumed_samples": 410880, "global_step/max_steps": "1605/6362"} +{"lm loss": 5.08525085, "grad_norm": 1.18948638, "learning_rate": 8.953e-05, "elapsed_time_per_iteration": 6.51084566, "memory(GiB)": 21.51, "elapsed_time": "2h 55m 53s", "remaining_time": "8h 40m 51s", "loss_scale": 1.0, "consumed_samples": 411136, "global_step/max_steps": "1606/6362"} +{"lm loss": 5.06678104, "grad_norm": 0.9048202, "learning_rate": 8.952e-05, "elapsed_time_per_iteration": 6.71348882, "memory(GiB)": 21.51, "elapsed_time": "2h 55m 59s", "remaining_time": "8h 40m 45s", "loss_scale": 1.0, "consumed_samples": 411392, "global_step/max_steps": "1607/6362"} +{"lm loss": 5.08015156, "grad_norm": 0.85939503, "learning_rate": 8.95e-05, "elapsed_time_per_iteration": 6.52420044, "memory(GiB)": 21.51, "elapsed_time": "2h 56m 6s", "remaining_time": "8h 40m 38s", "loss_scale": 1.0, "consumed_samples": 411648, "global_step/max_steps": "1608/6362"} +{"lm loss": 5.07369423, "grad_norm": 0.98729628, "learning_rate": 8.949e-05, "elapsed_time_per_iteration": 6.59033608, "memory(GiB)": 21.51, "elapsed_time": "2h 56m 12s", "remaining_time": "8h 40m 32s", "loss_scale": 1.0, "consumed_samples": 411904, "global_step/max_steps": "1609/6362"} +{"lm loss": 5.07683659, "grad_norm": 0.96346319, "learning_rate": 8.947e-05, "elapsed_time_per_iteration": 6.47366571, "memory(GiB)": 21.51, "elapsed_time": "2h 56m 19s", "remaining_time": "8h 40m 25s", "loss_scale": 1.0, "consumed_samples": 412160, "global_step/max_steps": "1610/6362"} +{"lm loss": 5.08334208, "grad_norm": 0.91459084, "learning_rate": 8.945e-05, "elapsed_time_per_iteration": 6.47819686, "memory(GiB)": 21.51, "elapsed_time": "2h 56m 25s", "remaining_time": "8h 40m 18s", "loss_scale": 1.0, "consumed_samples": 412416, "global_step/max_steps": "1611/6362"} +{"lm loss": 5.04139853, "grad_norm": 0.81492019, "learning_rate": 8.944e-05, "elapsed_time_per_iteration": 6.80024004, "memory(GiB)": 21.51, "elapsed_time": "2h 56m 32s", "remaining_time": "8h 40m 12s", "loss_scale": 1.0, "consumed_samples": 412672, "global_step/max_steps": "1612/6362"} +{"lm loss": 5.06101227, "grad_norm": 0.71086818, "learning_rate": 8.942e-05, "elapsed_time_per_iteration": 6.55604792, "memory(GiB)": 21.51, "elapsed_time": "2h 56m 39s", "remaining_time": "8h 40m 6s", "loss_scale": 1.0, "consumed_samples": 412928, "global_step/max_steps": "1613/6362"} +{"lm loss": 5.0756259, "grad_norm": 0.78894269, "learning_rate": 8.941e-05, "elapsed_time_per_iteration": 6.42655683, "memory(GiB)": 21.51, "elapsed_time": "2h 56m 45s", "remaining_time": "8h 39m 59s", "loss_scale": 1.0, "consumed_samples": 413184, "global_step/max_steps": "1614/6362"} +{"lm loss": 5.0821085, "grad_norm": 0.64351606, "learning_rate": 8.939e-05, "elapsed_time_per_iteration": 6.53120279, "memory(GiB)": 21.51, "elapsed_time": "2h 56m 52s", "remaining_time": "8h 39m 52s", "loss_scale": 1.0, "consumed_samples": 413440, "global_step/max_steps": "1615/6362"} +{"lm loss": 5.07689524, "grad_norm": 0.71439672, "learning_rate": 8.938e-05, "elapsed_time_per_iteration": 6.70680928, "memory(GiB)": 21.51, "elapsed_time": "2h 56m 58s", "remaining_time": "8h 39m 46s", "loss_scale": 1.0, "consumed_samples": 413696, "global_step/max_steps": "1616/6362"} +{"lm loss": 5.07997704, "grad_norm": 0.75049013, "learning_rate": 8.936e-05, "elapsed_time_per_iteration": 6.61004353, "memory(GiB)": 21.51, "elapsed_time": "2h 57m 5s", "remaining_time": "8h 39m 39s", "loss_scale": 1.0, "consumed_samples": 413952, "global_step/max_steps": "1617/6362"} +{"lm loss": 5.06544828, "grad_norm": 0.64889544, "learning_rate": 8.934e-05, "elapsed_time_per_iteration": 6.67758965, "memory(GiB)": 21.51, "elapsed_time": "2h 57m 12s", "remaining_time": "8h 39m 33s", "loss_scale": 1.0, "consumed_samples": 414208, "global_step/max_steps": "1618/6362"} +{"lm loss": 5.06203175, "grad_norm": 0.73720235, "learning_rate": 8.933e-05, "elapsed_time_per_iteration": 6.79670405, "memory(GiB)": 21.51, "elapsed_time": "2h 57m 18s", "remaining_time": "8h 39m 27s", "loss_scale": 1.0, "consumed_samples": 414464, "global_step/max_steps": "1619/6362"} +{"lm loss": 5.07523394, "grad_norm": 0.86929631, "learning_rate": 8.931e-05, "elapsed_time_per_iteration": 6.78283024, "memory(GiB)": 21.51, "elapsed_time": "2h 57m 25s", "remaining_time": "8h 39m 21s", "loss_scale": 1.0, "consumed_samples": 414720, "global_step/max_steps": "1620/6362"} +{"lm loss": 5.05517006, "grad_norm": 0.89102978, "learning_rate": 8.93e-05, "elapsed_time_per_iteration": 6.61529922, "memory(GiB)": 21.51, "elapsed_time": "2h 57m 32s", "remaining_time": "8h 39m 15s", "loss_scale": 1.0, "consumed_samples": 414976, "global_step/max_steps": "1621/6362"} +{"lm loss": 5.0777297, "grad_norm": 0.85525763, "learning_rate": 8.928e-05, "elapsed_time_per_iteration": 6.65178943, "memory(GiB)": 21.51, "elapsed_time": "2h 57m 39s", "remaining_time": "8h 39m 8s", "loss_scale": 1.0, "consumed_samples": 415232, "global_step/max_steps": "1622/6362"} +{"lm loss": 5.0730114, "grad_norm": 0.87713325, "learning_rate": 8.926e-05, "elapsed_time_per_iteration": 6.68770218, "memory(GiB)": 21.51, "elapsed_time": "2h 57m 45s", "remaining_time": "8h 39m 2s", "loss_scale": 1.0, "consumed_samples": 415488, "global_step/max_steps": "1623/6362"} +{"lm loss": 5.05575037, "grad_norm": 0.85044348, "learning_rate": 8.925e-05, "elapsed_time_per_iteration": 6.60785055, "memory(GiB)": 21.51, "elapsed_time": "2h 57m 52s", "remaining_time": "8h 38m 56s", "loss_scale": 1.0, "consumed_samples": 415744, "global_step/max_steps": "1624/6362"} +{"lm loss": 5.08385324, "grad_norm": 0.78247088, "learning_rate": 8.923e-05, "elapsed_time_per_iteration": 6.46312165, "memory(GiB)": 21.51, "elapsed_time": "2h 57m 58s", "remaining_time": "8h 38m 49s", "loss_scale": 1.0, "consumed_samples": 416000, "global_step/max_steps": "1625/6362"} +{"lm loss": 5.07123232, "grad_norm": 0.71819258, "learning_rate": 8.922e-05, "elapsed_time_per_iteration": 6.62088823, "memory(GiB)": 21.51, "elapsed_time": "2h 58m 5s", "remaining_time": "8h 38m 42s", "loss_scale": 1.0, "consumed_samples": 416256, "global_step/max_steps": "1626/6362"} +{"lm loss": 5.0715971, "grad_norm": 0.87929863, "learning_rate": 8.92e-05, "elapsed_time_per_iteration": 6.83872199, "memory(GiB)": 21.51, "elapsed_time": "2h 58m 12s", "remaining_time": "8h 38m 37s", "loss_scale": 1.0, "consumed_samples": 416512, "global_step/max_steps": "1627/6362"} +{"lm loss": 5.07125807, "grad_norm": 0.88341635, "learning_rate": 8.919e-05, "elapsed_time_per_iteration": 6.61699843, "memory(GiB)": 21.51, "elapsed_time": "2h 58m 18s", "remaining_time": "8h 38m 30s", "loss_scale": 1.0, "consumed_samples": 416768, "global_step/max_steps": "1628/6362"} +{"lm loss": 5.05658293, "grad_norm": 0.83723074, "learning_rate": 8.917e-05, "elapsed_time_per_iteration": 6.66232538, "memory(GiB)": 21.51, "elapsed_time": "2h 58m 25s", "remaining_time": "8h 38m 24s", "loss_scale": 1.0, "consumed_samples": 417024, "global_step/max_steps": "1629/6362"} +{"lm loss": 5.07384157, "grad_norm": 0.66745061, "learning_rate": 8.915e-05, "elapsed_time_per_iteration": 6.43979478, "memory(GiB)": 21.51, "elapsed_time": "2h 58m 31s", "remaining_time": "8h 38m 17s", "loss_scale": 1.0, "consumed_samples": 417280, "global_step/max_steps": "1630/6362"} +{"lm loss": 5.08041811, "grad_norm": 0.68157411, "learning_rate": 8.914e-05, "elapsed_time_per_iteration": 6.59949493, "memory(GiB)": 21.51, "elapsed_time": "2h 58m 38s", "remaining_time": "8h 38m 10s", "loss_scale": 1.0, "consumed_samples": 417536, "global_step/max_steps": "1631/6362"} +{"lm loss": 5.07811356, "grad_norm": 0.71275502, "learning_rate": 8.912e-05, "elapsed_time_per_iteration": 6.79563808, "memory(GiB)": 21.51, "elapsed_time": "2h 58m 45s", "remaining_time": "8h 38m 5s", "loss_scale": 1.0, "consumed_samples": 417792, "global_step/max_steps": "1632/6362"} +{"lm loss": 5.06263924, "grad_norm": 0.73643243, "learning_rate": 8.911e-05, "elapsed_time_per_iteration": 6.65169811, "memory(GiB)": 21.51, "elapsed_time": "2h 58m 51s", "remaining_time": "8h 37m 58s", "loss_scale": 1.0, "consumed_samples": 418048, "global_step/max_steps": "1633/6362"} +{"lm loss": 5.06605959, "grad_norm": 0.80942118, "learning_rate": 8.909e-05, "elapsed_time_per_iteration": 6.77299976, "memory(GiB)": 21.51, "elapsed_time": "2h 58m 58s", "remaining_time": "8h 37m 52s", "loss_scale": 1.0, "consumed_samples": 418304, "global_step/max_steps": "1634/6362"} +{"lm loss": 5.05178642, "grad_norm": 0.84406561, "learning_rate": 8.907e-05, "elapsed_time_per_iteration": 6.57764649, "memory(GiB)": 21.51, "elapsed_time": "2h 59m 5s", "remaining_time": "8h 37m 46s", "loss_scale": 1.0, "consumed_samples": 418560, "global_step/max_steps": "1635/6362"} +{"lm loss": 5.06283903, "grad_norm": 0.81503516, "learning_rate": 8.906e-05, "elapsed_time_per_iteration": 6.55159616, "memory(GiB)": 21.51, "elapsed_time": "2h 59m 11s", "remaining_time": "8h 37m 39s", "loss_scale": 1.0, "consumed_samples": 418816, "global_step/max_steps": "1636/6362"} +{"lm loss": 5.05045509, "grad_norm": 0.80894494, "learning_rate": 8.904e-05, "elapsed_time_per_iteration": 7.31191611, "memory(GiB)": 21.51, "elapsed_time": "2h 59m 19s", "remaining_time": "8h 37m 35s", "loss_scale": 1.0, "consumed_samples": 419072, "global_step/max_steps": "1637/6362"} +{"lm loss": 5.07496548, "grad_norm": 0.75884944, "learning_rate": 8.903e-05, "elapsed_time_per_iteration": 6.5143733, "memory(GiB)": 21.51, "elapsed_time": "2h 59m 25s", "remaining_time": "8h 37m 28s", "loss_scale": 1.0, "consumed_samples": 419328, "global_step/max_steps": "1638/6362"} +{"lm loss": 5.06707191, "grad_norm": 0.74657202, "learning_rate": 8.901e-05, "elapsed_time_per_iteration": 6.48154235, "memory(GiB)": 21.51, "elapsed_time": "2h 59m 32s", "remaining_time": "8h 37m 21s", "loss_scale": 1.0, "consumed_samples": 419584, "global_step/max_steps": "1639/6362"} +{"lm loss": 5.07455301, "grad_norm": 0.67302388, "learning_rate": 8.899e-05, "elapsed_time_per_iteration": 6.61311102, "memory(GiB)": 21.51, "elapsed_time": "2h 59m 38s", "remaining_time": "8h 37m 15s", "loss_scale": 1.0, "consumed_samples": 419840, "global_step/max_steps": "1640/6362"} +{"lm loss": 5.07543135, "grad_norm": 0.72281557, "learning_rate": 8.898e-05, "elapsed_time_per_iteration": 6.62000227, "memory(GiB)": 21.51, "elapsed_time": "2h 59m 45s", "remaining_time": "8h 37m 8s", "loss_scale": 1.0, "consumed_samples": 420096, "global_step/max_steps": "1641/6362"} +{"lm loss": 5.06690407, "grad_norm": 0.89580941, "learning_rate": 8.896e-05, "elapsed_time_per_iteration": 6.92800117, "memory(GiB)": 21.51, "elapsed_time": "2h 59m 52s", "remaining_time": "8h 37m 3s", "loss_scale": 1.0, "consumed_samples": 420352, "global_step/max_steps": "1642/6362"} +{"lm loss": 5.05681515, "grad_norm": 1.06326413, "learning_rate": 8.895e-05, "elapsed_time_per_iteration": 6.67185616, "memory(GiB)": 21.51, "elapsed_time": "2h 59m 59s", "remaining_time": "8h 36m 56s", "loss_scale": 1.0, "consumed_samples": 420608, "global_step/max_steps": "1643/6362"} +{"lm loss": 5.06812, "grad_norm": 0.94343919, "learning_rate": 8.893e-05, "elapsed_time_per_iteration": 6.72801137, "memory(GiB)": 21.51, "elapsed_time": "3h 0m 5s", "remaining_time": "8h 36m 50s", "loss_scale": 1.0, "consumed_samples": 420864, "global_step/max_steps": "1644/6362"} +{"lm loss": 5.08558083, "grad_norm": 0.77322882, "learning_rate": 8.891e-05, "elapsed_time_per_iteration": 6.68700886, "memory(GiB)": 21.51, "elapsed_time": "3h 0m 12s", "remaining_time": "8h 36m 44s", "loss_scale": 1.0, "consumed_samples": 421120, "global_step/max_steps": "1645/6362"} +{"lm loss": 5.08024931, "grad_norm": 0.80758876, "learning_rate": 8.89e-05, "elapsed_time_per_iteration": 6.76858282, "memory(GiB)": 21.51, "elapsed_time": "3h 0m 19s", "remaining_time": "8h 36m 38s", "loss_scale": 1.0, "consumed_samples": 421376, "global_step/max_steps": "1646/6362"} +{"lm loss": 5.06524849, "grad_norm": 0.84722424, "learning_rate": 8.888e-05, "elapsed_time_per_iteration": 6.43709612, "memory(GiB)": 21.51, "elapsed_time": "3h 0m 25s", "remaining_time": "8h 36m 31s", "loss_scale": 1.0, "consumed_samples": 421632, "global_step/max_steps": "1647/6362"} +{"lm loss": 5.05351734, "grad_norm": 0.81120718, "learning_rate": 8.887e-05, "elapsed_time_per_iteration": 6.73523498, "memory(GiB)": 21.51, "elapsed_time": "3h 0m 32s", "remaining_time": "8h 36m 25s", "loss_scale": 1.0, "consumed_samples": 421888, "global_step/max_steps": "1648/6362"} +{"lm loss": 5.05306387, "grad_norm": 0.71966624, "learning_rate": 8.885e-05, "elapsed_time_per_iteration": 6.51102328, "memory(GiB)": 21.51, "elapsed_time": "3h 0m 38s", "remaining_time": "8h 36m 18s", "loss_scale": 1.0, "consumed_samples": 422144, "global_step/max_steps": "1649/6362"} +{"lm loss": 5.06470346, "grad_norm": 0.62715596, "learning_rate": 8.883e-05, "elapsed_time_per_iteration": 6.71516109, "memory(GiB)": 21.51, "elapsed_time": "3h 0m 45s", "remaining_time": "8h 36m 12s", "loss_scale": 1.0, "consumed_samples": 422400, "global_step/max_steps": "1650/6362"} +{"lm loss": 5.06334543, "grad_norm": 0.79165369, "learning_rate": 8.882e-05, "elapsed_time_per_iteration": 6.53482938, "memory(GiB)": 21.51, "elapsed_time": "3h 0m 52s", "remaining_time": "8h 36m 5s", "loss_scale": 1.0, "consumed_samples": 422656, "global_step/max_steps": "1651/6362"} +{"lm loss": 5.06918287, "grad_norm": 0.7334761, "learning_rate": 8.88e-05, "elapsed_time_per_iteration": 6.66687894, "memory(GiB)": 21.51, "elapsed_time": "3h 0m 58s", "remaining_time": "8h 35m 59s", "loss_scale": 1.0, "consumed_samples": 422912, "global_step/max_steps": "1652/6362"} +{"lm loss": 5.06430054, "grad_norm": 0.81079268, "learning_rate": 8.879e-05, "elapsed_time_per_iteration": 6.86735487, "memory(GiB)": 21.51, "elapsed_time": "3h 1m 5s", "remaining_time": "8h 35m 53s", "loss_scale": 1.0, "consumed_samples": 423168, "global_step/max_steps": "1653/6362"} +{"lm loss": 5.06634188, "grad_norm": 0.77769864, "learning_rate": 8.877e-05, "elapsed_time_per_iteration": 6.70166802, "memory(GiB)": 21.51, "elapsed_time": "3h 1m 12s", "remaining_time": "8h 35m 47s", "loss_scale": 1.0, "consumed_samples": 423424, "global_step/max_steps": "1654/6362"} +{"lm loss": 5.0542078, "grad_norm": 0.57359713, "learning_rate": 8.875e-05, "elapsed_time_per_iteration": 6.52024055, "memory(GiB)": 21.51, "elapsed_time": "3h 1m 18s", "remaining_time": "8h 35m 40s", "loss_scale": 1.0, "consumed_samples": 423680, "global_step/max_steps": "1655/6362"} +{"lm loss": 5.06893587, "grad_norm": 0.72808057, "learning_rate": 8.874e-05, "elapsed_time_per_iteration": 6.59843802, "memory(GiB)": 21.51, "elapsed_time": "3h 1m 25s", "remaining_time": "8h 35m 34s", "loss_scale": 1.0, "consumed_samples": 423936, "global_step/max_steps": "1656/6362"} +{"lm loss": 5.05986118, "grad_norm": 0.75388038, "learning_rate": 8.872e-05, "elapsed_time_per_iteration": 6.47407436, "memory(GiB)": 21.51, "elapsed_time": "3h 1m 31s", "remaining_time": "8h 35m 27s", "loss_scale": 1.0, "consumed_samples": 424192, "global_step/max_steps": "1657/6362"} +{"lm loss": 5.05091095, "grad_norm": 0.83758539, "learning_rate": 8.87e-05, "elapsed_time_per_iteration": 6.54898906, "memory(GiB)": 21.51, "elapsed_time": "3h 1m 38s", "remaining_time": "8h 35m 20s", "loss_scale": 1.0, "consumed_samples": 424448, "global_step/max_steps": "1658/6362"} +{"lm loss": 5.07749224, "grad_norm": 0.87661296, "learning_rate": 8.869e-05, "elapsed_time_per_iteration": 6.57242012, "memory(GiB)": 21.51, "elapsed_time": "3h 1m 45s", "remaining_time": "8h 35m 14s", "loss_scale": 1.0, "consumed_samples": 424704, "global_step/max_steps": "1659/6362"} +{"lm loss": 5.08384418, "grad_norm": 0.81043166, "learning_rate": 8.867e-05, "elapsed_time_per_iteration": 6.43165541, "memory(GiB)": 21.51, "elapsed_time": "3h 1m 51s", "remaining_time": "8h 35m 7s", "loss_scale": 1.0, "consumed_samples": 424960, "global_step/max_steps": "1660/6362"} +{"lm loss": 5.06143332, "grad_norm": 0.90634263, "learning_rate": 8.866e-05, "elapsed_time_per_iteration": 6.52359676, "memory(GiB)": 21.51, "elapsed_time": "3h 1m 58s", "remaining_time": "8h 35m 0s", "loss_scale": 1.0, "consumed_samples": 425216, "global_step/max_steps": "1661/6362"} +{"lm loss": 5.06125212, "grad_norm": 1.13561344, "learning_rate": 8.864e-05, "elapsed_time_per_iteration": 6.65881634, "memory(GiB)": 21.51, "elapsed_time": "3h 2m 4s", "remaining_time": "8h 34m 54s", "loss_scale": 1.0, "consumed_samples": 425472, "global_step/max_steps": "1662/6362"} +{"lm loss": 5.06211901, "grad_norm": 1.14376628, "learning_rate": 8.862e-05, "elapsed_time_per_iteration": 6.64630127, "memory(GiB)": 21.51, "elapsed_time": "3h 2m 11s", "remaining_time": "8h 34m 47s", "loss_scale": 1.0, "consumed_samples": 425728, "global_step/max_steps": "1663/6362"} +{"lm loss": 5.05348158, "grad_norm": 0.7869103, "learning_rate": 8.861e-05, "elapsed_time_per_iteration": 6.59443212, "memory(GiB)": 21.51, "elapsed_time": "3h 2m 17s", "remaining_time": "8h 34m 41s", "loss_scale": 1.0, "consumed_samples": 425984, "global_step/max_steps": "1664/6362"} +{"lm loss": 5.07317066, "grad_norm": 0.7597267, "learning_rate": 8.859e-05, "elapsed_time_per_iteration": 6.44612622, "memory(GiB)": 21.51, "elapsed_time": "3h 2m 24s", "remaining_time": "8h 34m 34s", "loss_scale": 1.0, "consumed_samples": 426240, "global_step/max_steps": "1665/6362"} +{"lm loss": 5.0571785, "grad_norm": 0.9569478, "learning_rate": 8.858e-05, "elapsed_time_per_iteration": 6.40890932, "memory(GiB)": 21.51, "elapsed_time": "3h 2m 30s", "remaining_time": "8h 34m 27s", "loss_scale": 1.0, "consumed_samples": 426496, "global_step/max_steps": "1666/6362"} +{"lm loss": 5.05007362, "grad_norm": 0.9363687, "learning_rate": 8.856e-05, "elapsed_time_per_iteration": 6.43697166, "memory(GiB)": 21.51, "elapsed_time": "3h 2m 37s", "remaining_time": "8h 34m 20s", "loss_scale": 1.0, "consumed_samples": 426752, "global_step/max_steps": "1667/6362"} +{"lm loss": 5.05945683, "grad_norm": 0.83393031, "learning_rate": 8.854e-05, "elapsed_time_per_iteration": 6.79981375, "memory(GiB)": 21.51, "elapsed_time": "3h 2m 44s", "remaining_time": "8h 34m 14s", "loss_scale": 1.0, "consumed_samples": 427008, "global_step/max_steps": "1668/6362"} +{"lm loss": 5.06114912, "grad_norm": 0.73449415, "learning_rate": 8.853e-05, "elapsed_time_per_iteration": 6.62023067, "memory(GiB)": 21.51, "elapsed_time": "3h 2m 50s", "remaining_time": "8h 34m 8s", "loss_scale": 1.0, "consumed_samples": 427264, "global_step/max_steps": "1669/6362"} +{"lm loss": 5.04535627, "grad_norm": 0.77370793, "learning_rate": 8.851e-05, "elapsed_time_per_iteration": 6.5497241, "memory(GiB)": 21.51, "elapsed_time": "3h 2m 57s", "remaining_time": "8h 34m 1s", "loss_scale": 1.0, "consumed_samples": 427520, "global_step/max_steps": "1670/6362"} +{"lm loss": 5.07738352, "grad_norm": 0.71181077, "learning_rate": 8.849e-05, "elapsed_time_per_iteration": 6.42936754, "memory(GiB)": 21.51, "elapsed_time": "3h 3m 3s", "remaining_time": "8h 33m 54s", "loss_scale": 1.0, "consumed_samples": 427776, "global_step/max_steps": "1671/6362"} +{"lm loss": 5.04897976, "grad_norm": 0.70897752, "learning_rate": 8.848e-05, "elapsed_time_per_iteration": 6.33249998, "memory(GiB)": 21.51, "elapsed_time": "3h 3m 9s", "remaining_time": "8h 33m 47s", "loss_scale": 1.0, "consumed_samples": 428032, "global_step/max_steps": "1672/6362"} +{"lm loss": 5.04858303, "grad_norm": 0.77696818, "learning_rate": 8.846e-05, "elapsed_time_per_iteration": 6.55718279, "memory(GiB)": 21.51, "elapsed_time": "3h 3m 16s", "remaining_time": "8h 33m 40s", "loss_scale": 1.0, "consumed_samples": 428288, "global_step/max_steps": "1673/6362"} +{"lm loss": 5.04451084, "grad_norm": 0.74589556, "learning_rate": 8.844e-05, "elapsed_time_per_iteration": 6.46504641, "memory(GiB)": 21.51, "elapsed_time": "3h 3m 22s", "remaining_time": "8h 33m 33s", "loss_scale": 1.0, "consumed_samples": 428544, "global_step/max_steps": "1674/6362"} +{"lm loss": 5.05914974, "grad_norm": 0.74997514, "learning_rate": 8.843e-05, "elapsed_time_per_iteration": 6.42209339, "memory(GiB)": 21.51, "elapsed_time": "3h 3m 29s", "remaining_time": "8h 33m 26s", "loss_scale": 1.0, "consumed_samples": 428800, "global_step/max_steps": "1675/6362"} +{"lm loss": 5.06950712, "grad_norm": 0.67886883, "learning_rate": 8.841e-05, "elapsed_time_per_iteration": 6.65220642, "memory(GiB)": 21.51, "elapsed_time": "3h 3m 36s", "remaining_time": "8h 33m 20s", "loss_scale": 1.0, "consumed_samples": 429056, "global_step/max_steps": "1676/6362"} +{"lm loss": 5.06366587, "grad_norm": 0.60484654, "learning_rate": 8.84e-05, "elapsed_time_per_iteration": 6.32820368, "memory(GiB)": 21.51, "elapsed_time": "3h 3m 42s", "remaining_time": "8h 33m 13s", "loss_scale": 1.0, "consumed_samples": 429312, "global_step/max_steps": "1677/6362"} +{"lm loss": 5.05871201, "grad_norm": 0.64177823, "learning_rate": 8.838e-05, "elapsed_time_per_iteration": 6.4505446, "memory(GiB)": 21.51, "elapsed_time": "3h 3m 48s", "remaining_time": "8h 33m 6s", "loss_scale": 1.0, "consumed_samples": 429568, "global_step/max_steps": "1678/6362"} +{"lm loss": 5.0591836, "grad_norm": 0.58958149, "learning_rate": 8.836e-05, "elapsed_time_per_iteration": 6.42945695, "memory(GiB)": 21.51, "elapsed_time": "3h 3m 55s", "remaining_time": "8h 32m 59s", "loss_scale": 1.0, "consumed_samples": 429824, "global_step/max_steps": "1679/6362"} +{"lm loss": 5.05500841, "grad_norm": 0.60432035, "learning_rate": 8.835e-05, "elapsed_time_per_iteration": 6.48487663, "memory(GiB)": 21.51, "elapsed_time": "3h 4m 1s", "remaining_time": "8h 32m 52s", "loss_scale": 1.0, "consumed_samples": 430080, "global_step/max_steps": "1680/6362"} +{"lm loss": 5.07697773, "grad_norm": 0.650464, "learning_rate": 8.833e-05, "elapsed_time_per_iteration": 6.59229779, "memory(GiB)": 21.51, "elapsed_time": "3h 4m 8s", "remaining_time": "8h 32m 45s", "loss_scale": 1.0, "consumed_samples": 430336, "global_step/max_steps": "1681/6362"} +{"lm loss": 5.05967188, "grad_norm": 0.7686578, "learning_rate": 8.831e-05, "elapsed_time_per_iteration": 6.50812554, "memory(GiB)": 21.51, "elapsed_time": "3h 4m 14s", "remaining_time": "8h 32m 39s", "loss_scale": 1.0, "consumed_samples": 430592, "global_step/max_steps": "1682/6362"} +{"lm loss": 5.06958437, "grad_norm": 0.70265901, "learning_rate": 8.83e-05, "elapsed_time_per_iteration": 6.5702939, "memory(GiB)": 21.51, "elapsed_time": "3h 4m 21s", "remaining_time": "8h 32m 32s", "loss_scale": 1.0, "consumed_samples": 430848, "global_step/max_steps": "1683/6362"} +{"lm loss": 5.08160591, "grad_norm": 0.76265973, "learning_rate": 8.828e-05, "elapsed_time_per_iteration": 6.55469894, "memory(GiB)": 21.51, "elapsed_time": "3h 4m 27s", "remaining_time": "8h 32m 25s", "loss_scale": 1.0, "consumed_samples": 431104, "global_step/max_steps": "1684/6362"} +{"lm loss": 5.04661798, "grad_norm": 0.76056671, "learning_rate": 8.826e-05, "elapsed_time_per_iteration": 6.50923991, "memory(GiB)": 21.51, "elapsed_time": "3h 4m 34s", "remaining_time": "8h 32m 19s", "loss_scale": 1.0, "consumed_samples": 431360, "global_step/max_steps": "1685/6362"} +{"lm loss": 5.07486868, "grad_norm": 0.82472676, "learning_rate": 8.825e-05, "elapsed_time_per_iteration": 6.71306705, "memory(GiB)": 21.51, "elapsed_time": "3h 4m 41s", "remaining_time": "8h 32m 12s", "loss_scale": 1.0, "consumed_samples": 431616, "global_step/max_steps": "1686/6362"} +{"lm loss": 5.06286383, "grad_norm": 0.83668661, "learning_rate": 8.823e-05, "elapsed_time_per_iteration": 6.69059753, "memory(GiB)": 21.51, "elapsed_time": "3h 4m 47s", "remaining_time": "8h 32m 6s", "loss_scale": 1.0, "consumed_samples": 431872, "global_step/max_steps": "1687/6362"} +{"lm loss": 5.08564568, "grad_norm": 0.8724317, "learning_rate": 8.822e-05, "elapsed_time_per_iteration": 6.69430447, "memory(GiB)": 21.51, "elapsed_time": "3h 4m 54s", "remaining_time": "8h 32m 0s", "loss_scale": 1.0, "consumed_samples": 432128, "global_step/max_steps": "1688/6362"} +{"lm loss": 5.06981325, "grad_norm": 0.83568281, "learning_rate": 8.82e-05, "elapsed_time_per_iteration": 6.36144018, "memory(GiB)": 21.51, "elapsed_time": "3h 5m 0s", "remaining_time": "8h 31m 53s", "loss_scale": 1.0, "consumed_samples": 432384, "global_step/max_steps": "1689/6362"} +{"lm loss": 5.06716537, "grad_norm": 0.74701488, "learning_rate": 8.818e-05, "elapsed_time_per_iteration": 6.53378534, "memory(GiB)": 21.51, "elapsed_time": "3h 5m 7s", "remaining_time": "8h 31m 46s", "loss_scale": 1.0, "consumed_samples": 432640, "global_step/max_steps": "1690/6362"} +{"lm loss": 5.06971216, "grad_norm": 0.84174216, "learning_rate": 8.817e-05, "elapsed_time_per_iteration": 6.36158586, "memory(GiB)": 21.51, "elapsed_time": "3h 5m 13s", "remaining_time": "8h 31m 39s", "loss_scale": 1.0, "consumed_samples": 432896, "global_step/max_steps": "1691/6362"} +{"lm loss": 5.06369686, "grad_norm": 1.00582814, "learning_rate": 8.815e-05, "elapsed_time_per_iteration": 6.49449992, "memory(GiB)": 21.51, "elapsed_time": "3h 5m 20s", "remaining_time": "8h 31m 32s", "loss_scale": 1.0, "consumed_samples": 433152, "global_step/max_steps": "1692/6362"} +{"lm loss": 5.05155087, "grad_norm": 1.00725305, "learning_rate": 8.813e-05, "elapsed_time_per_iteration": 6.72330642, "memory(GiB)": 21.51, "elapsed_time": "3h 5m 27s", "remaining_time": "8h 31m 26s", "loss_scale": 1.0, "consumed_samples": 433408, "global_step/max_steps": "1693/6362"} +{"lm loss": 5.08250809, "grad_norm": 0.7276563, "learning_rate": 8.812e-05, "elapsed_time_per_iteration": 6.50509119, "memory(GiB)": 21.51, "elapsed_time": "3h 5m 33s", "remaining_time": "8h 31m 19s", "loss_scale": 1.0, "consumed_samples": 433664, "global_step/max_steps": "1694/6362"} +{"lm loss": 5.07383156, "grad_norm": 0.65294558, "learning_rate": 8.81e-05, "elapsed_time_per_iteration": 6.50132012, "memory(GiB)": 21.51, "elapsed_time": "3h 5m 40s", "remaining_time": "8h 31m 13s", "loss_scale": 1.0, "consumed_samples": 433920, "global_step/max_steps": "1695/6362"} +{"lm loss": 5.04603148, "grad_norm": 0.69398248, "learning_rate": 8.808e-05, "elapsed_time_per_iteration": 6.72378755, "memory(GiB)": 21.51, "elapsed_time": "3h 5m 46s", "remaining_time": "8h 31m 6s", "loss_scale": 1.0, "consumed_samples": 434176, "global_step/max_steps": "1696/6362"} +{"lm loss": 5.05711222, "grad_norm": 0.67144489, "learning_rate": 8.807e-05, "elapsed_time_per_iteration": 6.58449697, "memory(GiB)": 21.51, "elapsed_time": "3h 5m 53s", "remaining_time": "8h 31m 0s", "loss_scale": 1.0, "consumed_samples": 434432, "global_step/max_steps": "1697/6362"} +{"lm loss": 5.04625559, "grad_norm": 0.66596437, "learning_rate": 8.805e-05, "elapsed_time_per_iteration": 6.40710044, "memory(GiB)": 21.51, "elapsed_time": "3h 5m 59s", "remaining_time": "8h 30m 53s", "loss_scale": 1.0, "consumed_samples": 434688, "global_step/max_steps": "1698/6362"} +{"lm loss": 5.05120897, "grad_norm": 0.66458637, "learning_rate": 8.803e-05, "elapsed_time_per_iteration": 6.83266568, "memory(GiB)": 21.51, "elapsed_time": "3h 6m 6s", "remaining_time": "8h 30m 47s", "loss_scale": 1.0, "consumed_samples": 434944, "global_step/max_steps": "1699/6362"} +{"lm loss": 5.03296709, "grad_norm": 0.72898799, "learning_rate": 8.802e-05, "elapsed_time_per_iteration": 6.6284399, "memory(GiB)": 21.51, "elapsed_time": "3h 6m 13s", "remaining_time": "8h 30m 41s", "loss_scale": 1.0, "consumed_samples": 435200, "global_step/max_steps": "1700/6362"} +{"lm loss": 5.06558084, "grad_norm": 0.86473733, "learning_rate": 8.8e-05, "elapsed_time_per_iteration": 6.53275108, "memory(GiB)": 21.51, "elapsed_time": "3h 6m 19s", "remaining_time": "8h 30m 34s", "loss_scale": 1.0, "consumed_samples": 435456, "global_step/max_steps": "1701/6362"} +{"lm loss": 5.07654524, "grad_norm": 0.98258215, "learning_rate": 8.798e-05, "elapsed_time_per_iteration": 6.58288074, "memory(GiB)": 21.51, "elapsed_time": "3h 6m 26s", "remaining_time": "8h 30m 27s", "loss_scale": 1.0, "consumed_samples": 435712, "global_step/max_steps": "1702/6362"} +{"lm loss": 5.07807493, "grad_norm": 0.96679604, "learning_rate": 8.797e-05, "elapsed_time_per_iteration": 6.54291964, "memory(GiB)": 21.51, "elapsed_time": "3h 6m 32s", "remaining_time": "8h 30m 21s", "loss_scale": 1.0, "consumed_samples": 435968, "global_step/max_steps": "1703/6362"} +{"lm loss": 5.0696106, "grad_norm": 0.88251901, "learning_rate": 8.795e-05, "elapsed_time_per_iteration": 6.645468, "memory(GiB)": 21.51, "elapsed_time": "3h 6m 39s", "remaining_time": "8h 30m 14s", "loss_scale": 1.0, "consumed_samples": 436224, "global_step/max_steps": "1704/6362"} +{"lm loss": 5.0544157, "grad_norm": 0.70031357, "learning_rate": 8.793e-05, "elapsed_time_per_iteration": 6.42837262, "memory(GiB)": 21.51, "elapsed_time": "3h 6m 45s", "remaining_time": "8h 30m 7s", "loss_scale": 1.0, "consumed_samples": 436480, "global_step/max_steps": "1705/6362"} +{"lm loss": 5.03782558, "grad_norm": 0.73473644, "learning_rate": 8.792e-05, "elapsed_time_per_iteration": 6.45738339, "memory(GiB)": 21.51, "elapsed_time": "3h 6m 52s", "remaining_time": "8h 30m 0s", "loss_scale": 1.0, "consumed_samples": 436736, "global_step/max_steps": "1706/6362"} +{"lm loss": 5.07566071, "grad_norm": 0.80554789, "learning_rate": 8.79e-05, "elapsed_time_per_iteration": 6.55212092, "memory(GiB)": 21.51, "elapsed_time": "3h 6m 58s", "remaining_time": "8h 29m 54s", "loss_scale": 1.0, "consumed_samples": 436992, "global_step/max_steps": "1707/6362"} +{"lm loss": 5.03747082, "grad_norm": 0.76490706, "learning_rate": 8.788e-05, "elapsed_time_per_iteration": 6.56092834, "memory(GiB)": 21.51, "elapsed_time": "3h 7m 5s", "remaining_time": "8h 29m 47s", "loss_scale": 1.0, "consumed_samples": 437248, "global_step/max_steps": "1708/6362"} +{"lm loss": 5.07197809, "grad_norm": 0.73912132, "learning_rate": 8.787e-05, "elapsed_time_per_iteration": 6.64250183, "memory(GiB)": 21.51, "elapsed_time": "3h 7m 12s", "remaining_time": "8h 29m 41s", "loss_scale": 1.0, "consumed_samples": 437504, "global_step/max_steps": "1709/6362"} +{"lm loss": 5.05338478, "grad_norm": 0.64192235, "learning_rate": 8.785e-05, "elapsed_time_per_iteration": 6.56968498, "memory(GiB)": 21.51, "elapsed_time": "3h 7m 18s", "remaining_time": "8h 29m 34s", "loss_scale": 1.0, "consumed_samples": 437760, "global_step/max_steps": "1710/6362"} +{"lm loss": 5.08153343, "grad_norm": 0.71967047, "learning_rate": 8.783e-05, "elapsed_time_per_iteration": 6.53114152, "memory(GiB)": 21.51, "elapsed_time": "3h 7m 25s", "remaining_time": "8h 29m 28s", "loss_scale": 1.0, "consumed_samples": 438016, "global_step/max_steps": "1711/6362"} +{"lm loss": 5.05472279, "grad_norm": 0.7893104, "learning_rate": 8.782e-05, "elapsed_time_per_iteration": 6.83744454, "memory(GiB)": 21.51, "elapsed_time": "3h 7m 32s", "remaining_time": "8h 29m 22s", "loss_scale": 1.0, "consumed_samples": 438272, "global_step/max_steps": "1712/6362"} +{"lm loss": 5.06549978, "grad_norm": 0.84163654, "learning_rate": 8.78e-05, "elapsed_time_per_iteration": 6.53254247, "memory(GiB)": 21.51, "elapsed_time": "3h 7m 38s", "remaining_time": "8h 29m 15s", "loss_scale": 1.0, "consumed_samples": 438528, "global_step/max_steps": "1713/6362"} +{"lm loss": 5.04559088, "grad_norm": 0.86567819, "learning_rate": 8.778e-05, "elapsed_time_per_iteration": 6.69651031, "memory(GiB)": 21.51, "elapsed_time": "3h 7m 45s", "remaining_time": "8h 29m 9s", "loss_scale": 1.0, "consumed_samples": 438784, "global_step/max_steps": "1714/6362"} +{"lm loss": 5.04885244, "grad_norm": 0.6979301, "learning_rate": 8.777e-05, "elapsed_time_per_iteration": 6.49691796, "memory(GiB)": 21.51, "elapsed_time": "3h 7m 51s", "remaining_time": "8h 29m 2s", "loss_scale": 1.0, "consumed_samples": 439040, "global_step/max_steps": "1715/6362"} +{"lm loss": 5.08470345, "grad_norm": 0.64685243, "learning_rate": 8.775e-05, "elapsed_time_per_iteration": 6.65851665, "memory(GiB)": 21.51, "elapsed_time": "3h 7m 58s", "remaining_time": "8h 28m 56s", "loss_scale": 1.0, "consumed_samples": 439296, "global_step/max_steps": "1716/6362"} +{"lm loss": 5.05012703, "grad_norm": 0.80553669, "learning_rate": 8.773e-05, "elapsed_time_per_iteration": 6.73618889, "memory(GiB)": 21.51, "elapsed_time": "3h 8m 5s", "remaining_time": "8h 28m 50s", "loss_scale": 1.0, "consumed_samples": 439552, "global_step/max_steps": "1717/6362"} +{"lm loss": 5.04466581, "grad_norm": 0.74004811, "learning_rate": 8.772e-05, "elapsed_time_per_iteration": 6.62633109, "memory(GiB)": 21.51, "elapsed_time": "3h 8m 11s", "remaining_time": "8h 28m 43s", "loss_scale": 1.0, "consumed_samples": 439808, "global_step/max_steps": "1718/6362"} +{"lm loss": 5.0594511, "grad_norm": 0.72269905, "learning_rate": 8.77e-05, "elapsed_time_per_iteration": 6.49782395, "memory(GiB)": 21.51, "elapsed_time": "3h 8m 18s", "remaining_time": "8h 28m 36s", "loss_scale": 1.0, "consumed_samples": 440064, "global_step/max_steps": "1719/6362"} +{"lm loss": 5.0634284, "grad_norm": 0.73579323, "learning_rate": 8.768e-05, "elapsed_time_per_iteration": 6.53920794, "memory(GiB)": 21.51, "elapsed_time": "3h 8m 24s", "remaining_time": "8h 28m 30s", "loss_scale": 1.0, "consumed_samples": 440320, "global_step/max_steps": "1720/6362"} +{"lm loss": 5.04654169, "grad_norm": 0.63502938, "learning_rate": 8.767e-05, "elapsed_time_per_iteration": 6.34890032, "memory(GiB)": 21.51, "elapsed_time": "3h 8m 31s", "remaining_time": "8h 28m 22s", "loss_scale": 1.0, "consumed_samples": 440576, "global_step/max_steps": "1721/6362"} +{"lm loss": 5.05508471, "grad_norm": 0.73535341, "learning_rate": 8.765e-05, "elapsed_time_per_iteration": 6.41184616, "memory(GiB)": 21.51, "elapsed_time": "3h 8m 37s", "remaining_time": "8h 28m 15s", "loss_scale": 1.0, "consumed_samples": 440832, "global_step/max_steps": "1722/6362"} +{"lm loss": 5.07203341, "grad_norm": 0.78716099, "learning_rate": 8.763e-05, "elapsed_time_per_iteration": 6.8718946, "memory(GiB)": 21.51, "elapsed_time": "3h 8m 44s", "remaining_time": "8h 28m 10s", "loss_scale": 1.0, "consumed_samples": 441088, "global_step/max_steps": "1723/6362"} +{"lm loss": 5.0474, "grad_norm": 0.8339138, "learning_rate": 8.762e-05, "elapsed_time_per_iteration": 6.42934537, "memory(GiB)": 21.51, "elapsed_time": "3h 8m 50s", "remaining_time": "8h 28m 3s", "loss_scale": 1.0, "consumed_samples": 441344, "global_step/max_steps": "1724/6362"} +{"lm loss": 5.06511021, "grad_norm": 0.74366802, "learning_rate": 8.76e-05, "elapsed_time_per_iteration": 6.22925758, "memory(GiB)": 21.51, "elapsed_time": "3h 8m 57s", "remaining_time": "8h 27m 55s", "loss_scale": 1.0, "consumed_samples": 441600, "global_step/max_steps": "1725/6362"} +{"lm loss": 5.0785079, "grad_norm": 0.73818886, "learning_rate": 8.758e-05, "elapsed_time_per_iteration": 6.41430402, "memory(GiB)": 21.51, "elapsed_time": "3h 9m 3s", "remaining_time": "8h 27m 48s", "loss_scale": 1.0, "consumed_samples": 441856, "global_step/max_steps": "1726/6362"} +{"lm loss": 5.0583787, "grad_norm": 0.72730011, "learning_rate": 8.757e-05, "elapsed_time_per_iteration": 6.53678083, "memory(GiB)": 21.51, "elapsed_time": "3h 9m 10s", "remaining_time": "8h 27m 42s", "loss_scale": 1.0, "consumed_samples": 442112, "global_step/max_steps": "1727/6362"} +{"lm loss": 5.05653095, "grad_norm": 0.72403389, "learning_rate": 8.755e-05, "elapsed_time_per_iteration": 6.71996355, "memory(GiB)": 21.51, "elapsed_time": "3h 9m 16s", "remaining_time": "8h 27m 35s", "loss_scale": 1.0, "consumed_samples": 442368, "global_step/max_steps": "1728/6362"} +{"lm loss": 5.04378891, "grad_norm": 0.74597883, "learning_rate": 8.753e-05, "elapsed_time_per_iteration": 6.59286666, "memory(GiB)": 21.51, "elapsed_time": "3h 9m 23s", "remaining_time": "8h 27m 29s", "loss_scale": 1.0, "consumed_samples": 442624, "global_step/max_steps": "1729/6362"} +{"lm loss": 5.04847956, "grad_norm": 0.87761337, "learning_rate": 8.751e-05, "elapsed_time_per_iteration": 6.54293489, "memory(GiB)": 21.51, "elapsed_time": "3h 9m 30s", "remaining_time": "8h 27m 22s", "loss_scale": 1.0, "consumed_samples": 442880, "global_step/max_steps": "1730/6362"} +{"lm loss": 5.05183363, "grad_norm": 1.02005386, "learning_rate": 8.75e-05, "elapsed_time_per_iteration": 6.56657219, "memory(GiB)": 21.51, "elapsed_time": "3h 9m 36s", "remaining_time": "8h 27m 16s", "loss_scale": 1.0, "consumed_samples": 443136, "global_step/max_steps": "1731/6362"} +{"lm loss": 5.0547576, "grad_norm": 0.95451742, "learning_rate": 8.748e-05, "elapsed_time_per_iteration": 6.41708779, "memory(GiB)": 21.51, "elapsed_time": "3h 9m 43s", "remaining_time": "8h 27m 9s", "loss_scale": 1.0, "consumed_samples": 443392, "global_step/max_steps": "1732/6362"} +{"lm loss": 5.06098366, "grad_norm": 0.92684418, "learning_rate": 8.746e-05, "elapsed_time_per_iteration": 6.37770176, "memory(GiB)": 21.51, "elapsed_time": "3h 9m 49s", "remaining_time": "8h 27m 2s", "loss_scale": 1.0, "consumed_samples": 443648, "global_step/max_steps": "1733/6362"} +{"lm loss": 5.07101345, "grad_norm": 0.88901603, "learning_rate": 8.745e-05, "elapsed_time_per_iteration": 6.4725709, "memory(GiB)": 21.51, "elapsed_time": "3h 9m 55s", "remaining_time": "8h 26m 55s", "loss_scale": 1.0, "consumed_samples": 443904, "global_step/max_steps": "1734/6362"} +{"lm loss": 5.06447315, "grad_norm": 0.77719599, "learning_rate": 8.743e-05, "elapsed_time_per_iteration": 6.61885667, "memory(GiB)": 21.51, "elapsed_time": "3h 10m 2s", "remaining_time": "8h 26m 48s", "loss_scale": 1.0, "consumed_samples": 444160, "global_step/max_steps": "1735/6362"} +{"lm loss": 5.07335234, "grad_norm": 0.64013618, "learning_rate": 8.741e-05, "elapsed_time_per_iteration": 6.57064104, "memory(GiB)": 21.51, "elapsed_time": "3h 10m 9s", "remaining_time": "8h 26m 42s", "loss_scale": 1.0, "consumed_samples": 444416, "global_step/max_steps": "1736/6362"} +{"lm loss": 5.06362629, "grad_norm": 0.7080676, "learning_rate": 8.74e-05, "elapsed_time_per_iteration": 6.67142773, "memory(GiB)": 21.51, "elapsed_time": "3h 10m 15s", "remaining_time": "8h 26m 35s", "loss_scale": 1.0, "consumed_samples": 444672, "global_step/max_steps": "1737/6362"} +{"lm loss": 5.07194901, "grad_norm": 0.64056301, "learning_rate": 8.738e-05, "elapsed_time_per_iteration": 6.67221165, "memory(GiB)": 21.51, "elapsed_time": "3h 10m 22s", "remaining_time": "8h 26m 29s", "loss_scale": 1.0, "consumed_samples": 444928, "global_step/max_steps": "1738/6362"} +{"lm loss": 5.05611086, "grad_norm": 0.67738336, "learning_rate": 8.736e-05, "elapsed_time_per_iteration": 6.52663326, "memory(GiB)": 21.51, "elapsed_time": "3h 10m 28s", "remaining_time": "8h 26m 22s", "loss_scale": 1.0, "consumed_samples": 445184, "global_step/max_steps": "1739/6362"} +{"lm loss": 5.0353899, "grad_norm": 0.60616499, "learning_rate": 8.735e-05, "elapsed_time_per_iteration": 6.55659628, "memory(GiB)": 21.51, "elapsed_time": "3h 10m 35s", "remaining_time": "8h 26m 16s", "loss_scale": 1.0, "consumed_samples": 445440, "global_step/max_steps": "1740/6362"} +{"lm loss": 5.06956577, "grad_norm": 0.62455273, "learning_rate": 8.733e-05, "elapsed_time_per_iteration": 6.75876999, "memory(GiB)": 21.51, "elapsed_time": "3h 10m 42s", "remaining_time": "8h 26m 10s", "loss_scale": 1.0, "consumed_samples": 445696, "global_step/max_steps": "1741/6362"} +{"lm loss": 5.05821037, "grad_norm": 0.59523636, "learning_rate": 8.731e-05, "elapsed_time_per_iteration": 6.81035638, "memory(GiB)": 21.51, "elapsed_time": "3h 10m 49s", "remaining_time": "8h 26m 4s", "loss_scale": 1.0, "consumed_samples": 445952, "global_step/max_steps": "1742/6362"} +{"lm loss": 5.05325699, "grad_norm": 0.62412262, "learning_rate": 8.729e-05, "elapsed_time_per_iteration": 6.54297352, "memory(GiB)": 21.51, "elapsed_time": "3h 10m 55s", "remaining_time": "8h 25m 57s", "loss_scale": 1.0, "consumed_samples": 446208, "global_step/max_steps": "1743/6362"} +{"lm loss": 5.0650878, "grad_norm": 0.5934397, "learning_rate": 8.728e-05, "elapsed_time_per_iteration": 6.57690763, "memory(GiB)": 21.51, "elapsed_time": "3h 11m 2s", "remaining_time": "8h 25m 51s", "loss_scale": 1.0, "consumed_samples": 446464, "global_step/max_steps": "1744/6362"} +{"lm loss": 5.06912231, "grad_norm": 0.69256294, "learning_rate": 8.726e-05, "elapsed_time_per_iteration": 6.48033643, "memory(GiB)": 21.51, "elapsed_time": "3h 11m 8s", "remaining_time": "8h 25m 44s", "loss_scale": 1.0, "consumed_samples": 446720, "global_step/max_steps": "1745/6362"} +{"lm loss": 5.04819202, "grad_norm": 0.87456286, "learning_rate": 8.724e-05, "elapsed_time_per_iteration": 6.51254416, "memory(GiB)": 21.51, "elapsed_time": "3h 11m 15s", "remaining_time": "8h 25m 37s", "loss_scale": 1.0, "consumed_samples": 446976, "global_step/max_steps": "1746/6362"} +{"lm loss": 5.06757832, "grad_norm": 1.19952512, "learning_rate": 8.723e-05, "elapsed_time_per_iteration": 6.59915948, "memory(GiB)": 21.51, "elapsed_time": "3h 11m 21s", "remaining_time": "8h 25m 31s", "loss_scale": 1.0, "consumed_samples": 447232, "global_step/max_steps": "1747/6362"} +{"lm loss": 5.06826973, "grad_norm": 0.86252946, "learning_rate": 8.721e-05, "elapsed_time_per_iteration": 6.71265721, "memory(GiB)": 21.51, "elapsed_time": "3h 11m 28s", "remaining_time": "8h 25m 24s", "loss_scale": 1.0, "consumed_samples": 447488, "global_step/max_steps": "1748/6362"} +{"lm loss": 5.05051756, "grad_norm": 0.73260653, "learning_rate": 8.719e-05, "elapsed_time_per_iteration": 6.79047084, "memory(GiB)": 21.51, "elapsed_time": "3h 11m 35s", "remaining_time": "8h 25m 18s", "loss_scale": 1.0, "consumed_samples": 447744, "global_step/max_steps": "1749/6362"} +{"lm loss": 5.05237293, "grad_norm": 0.6830799, "learning_rate": 8.717e-05, "elapsed_time_per_iteration": 6.58473802, "memory(GiB)": 21.51, "elapsed_time": "3h 11m 41s", "remaining_time": "8h 25m 12s", "loss_scale": 1.0, "consumed_samples": 448000, "global_step/max_steps": "1750/6362"} +{"lm loss": 5.04664278, "grad_norm": 0.74569273, "learning_rate": 8.716e-05, "elapsed_time_per_iteration": 6.67547297, "memory(GiB)": 21.51, "elapsed_time": "3h 11m 48s", "remaining_time": "8h 25m 5s", "loss_scale": 1.0, "consumed_samples": 448256, "global_step/max_steps": "1751/6362"} +{"lm loss": 5.03990364, "grad_norm": 0.7897777, "learning_rate": 8.714e-05, "elapsed_time_per_iteration": 6.7262013, "memory(GiB)": 21.51, "elapsed_time": "3h 11m 55s", "remaining_time": "8h 24m 59s", "loss_scale": 1.0, "consumed_samples": 448512, "global_step/max_steps": "1752/6362"} +{"lm loss": 5.04356861, "grad_norm": 0.76866096, "learning_rate": 8.712e-05, "elapsed_time_per_iteration": 6.48618412, "memory(GiB)": 21.51, "elapsed_time": "3h 12m 1s", "remaining_time": "8h 24m 53s", "loss_scale": 1.0, "consumed_samples": 448768, "global_step/max_steps": "1753/6362"} +{"lm loss": 5.04096174, "grad_norm": 0.77717859, "learning_rate": 8.711e-05, "elapsed_time_per_iteration": 6.82331824, "memory(GiB)": 21.51, "elapsed_time": "3h 12m 8s", "remaining_time": "8h 24m 47s", "loss_scale": 1.0, "consumed_samples": 449024, "global_step/max_steps": "1754/6362"} +{"lm loss": 5.06518888, "grad_norm": 0.83263361, "learning_rate": 8.709e-05, "elapsed_time_per_iteration": 6.63550735, "memory(GiB)": 21.51, "elapsed_time": "3h 12m 15s", "remaining_time": "8h 24m 40s", "loss_scale": 1.0, "consumed_samples": 449280, "global_step/max_steps": "1755/6362"} +{"lm loss": 5.03760576, "grad_norm": 0.81613731, "learning_rate": 8.707e-05, "elapsed_time_per_iteration": 6.45125842, "memory(GiB)": 21.51, "elapsed_time": "3h 12m 21s", "remaining_time": "8h 24m 33s", "loss_scale": 1.0, "consumed_samples": 449536, "global_step/max_steps": "1756/6362"} +{"lm loss": 5.04665852, "grad_norm": 0.78369409, "learning_rate": 8.706e-05, "elapsed_time_per_iteration": 6.60893726, "memory(GiB)": 21.51, "elapsed_time": "3h 12m 28s", "remaining_time": "8h 24m 27s", "loss_scale": 1.0, "consumed_samples": 449792, "global_step/max_steps": "1757/6362"} +{"lm loss": 5.05678415, "grad_norm": 0.62875849, "learning_rate": 8.704e-05, "elapsed_time_per_iteration": 6.30992126, "memory(GiB)": 21.51, "elapsed_time": "3h 12m 34s", "remaining_time": "8h 24m 20s", "loss_scale": 1.0, "consumed_samples": 450048, "global_step/max_steps": "1758/6362"} +{"lm loss": 5.04093933, "grad_norm": 0.58236647, "learning_rate": 8.702e-05, "elapsed_time_per_iteration": 6.68657207, "memory(GiB)": 21.51, "elapsed_time": "3h 12m 41s", "remaining_time": "8h 24m 13s", "loss_scale": 1.0, "consumed_samples": 450304, "global_step/max_steps": "1759/6362"} +{"lm loss": 5.06044197, "grad_norm": 0.72423887, "learning_rate": 8.7e-05, "elapsed_time_per_iteration": 6.5079751, "memory(GiB)": 21.51, "elapsed_time": "3h 12m 47s", "remaining_time": "8h 24m 7s", "loss_scale": 1.0, "consumed_samples": 450560, "global_step/max_steps": "1760/6362"} +{"lm loss": 5.0625329, "grad_norm": 0.72308916, "learning_rate": 8.699e-05, "elapsed_time_per_iteration": 6.6082356, "memory(GiB)": 21.51, "elapsed_time": "3h 12m 54s", "remaining_time": "8h 24m 0s", "loss_scale": 1.0, "consumed_samples": 450816, "global_step/max_steps": "1761/6362"} +{"lm loss": 5.05559635, "grad_norm": 0.77742261, "learning_rate": 8.697e-05, "elapsed_time_per_iteration": 6.60420966, "memory(GiB)": 21.51, "elapsed_time": "3h 13m 0s", "remaining_time": "8h 23m 54s", "loss_scale": 1.0, "consumed_samples": 451072, "global_step/max_steps": "1762/6362"} +{"lm loss": 5.06658125, "grad_norm": 0.93415964, "learning_rate": 8.695e-05, "elapsed_time_per_iteration": 6.5805974, "memory(GiB)": 21.51, "elapsed_time": "3h 13m 7s", "remaining_time": "8h 23m 47s", "loss_scale": 1.0, "consumed_samples": 451328, "global_step/max_steps": "1763/6362"} +{"lm loss": 5.04974318, "grad_norm": 1.10927474, "learning_rate": 8.693e-05, "elapsed_time_per_iteration": 6.66480517, "memory(GiB)": 21.51, "elapsed_time": "3h 13m 14s", "remaining_time": "8h 23m 41s", "loss_scale": 1.0, "consumed_samples": 451584, "global_step/max_steps": "1764/6362"} +{"lm loss": 5.04816389, "grad_norm": 0.84871131, "learning_rate": 8.692e-05, "elapsed_time_per_iteration": 6.52468324, "memory(GiB)": 21.51, "elapsed_time": "3h 13m 20s", "remaining_time": "8h 23m 34s", "loss_scale": 1.0, "consumed_samples": 451840, "global_step/max_steps": "1765/6362"} +{"lm loss": 5.0441556, "grad_norm": 0.71555203, "learning_rate": 8.69e-05, "elapsed_time_per_iteration": 6.57407832, "memory(GiB)": 21.51, "elapsed_time": "3h 13m 27s", "remaining_time": "8h 23m 27s", "loss_scale": 1.0, "consumed_samples": 452096, "global_step/max_steps": "1766/6362"} +{"lm loss": 5.0606823, "grad_norm": 0.80182242, "learning_rate": 8.688e-05, "elapsed_time_per_iteration": 6.65761542, "memory(GiB)": 21.51, "elapsed_time": "3h 13m 33s", "remaining_time": "8h 23m 21s", "loss_scale": 1.0, "consumed_samples": 452352, "global_step/max_steps": "1767/6362"} +{"lm loss": 5.05456209, "grad_norm": 0.74496925, "learning_rate": 8.687e-05, "elapsed_time_per_iteration": 6.45965958, "memory(GiB)": 21.51, "elapsed_time": "3h 13m 40s", "remaining_time": "8h 23m 14s", "loss_scale": 1.0, "consumed_samples": 452608, "global_step/max_steps": "1768/6362"} +{"lm loss": 5.0520668, "grad_norm": 0.70234793, "learning_rate": 8.685e-05, "elapsed_time_per_iteration": 6.67753577, "memory(GiB)": 21.51, "elapsed_time": "3h 13m 47s", "remaining_time": "8h 23m 8s", "loss_scale": 1.0, "consumed_samples": 452864, "global_step/max_steps": "1769/6362"} +{"lm loss": 5.03822374, "grad_norm": 0.62218767, "learning_rate": 8.683e-05, "elapsed_time_per_iteration": 6.5971601, "memory(GiB)": 21.51, "elapsed_time": "3h 13m 53s", "remaining_time": "8h 23m 1s", "loss_scale": 1.0, "consumed_samples": 453120, "global_step/max_steps": "1770/6362"} +{"lm loss": 5.07008505, "grad_norm": 0.70944124, "learning_rate": 8.681e-05, "elapsed_time_per_iteration": 6.55800486, "memory(GiB)": 21.51, "elapsed_time": "3h 14m 0s", "remaining_time": "8h 22m 55s", "loss_scale": 1.0, "consumed_samples": 453376, "global_step/max_steps": "1771/6362"} +{"lm loss": 5.06111479, "grad_norm": 0.79540169, "learning_rate": 8.68e-05, "elapsed_time_per_iteration": 6.90262794, "memory(GiB)": 21.51, "elapsed_time": "3h 14m 7s", "remaining_time": "8h 22m 49s", "loss_scale": 1.0, "consumed_samples": 453632, "global_step/max_steps": "1772/6362"} +{"lm loss": 5.05266237, "grad_norm": 0.85456502, "learning_rate": 8.678e-05, "elapsed_time_per_iteration": 6.60156465, "memory(GiB)": 21.51, "elapsed_time": "3h 14m 13s", "remaining_time": "8h 22m 43s", "loss_scale": 1.0, "consumed_samples": 453888, "global_step/max_steps": "1773/6362"} +{"lm loss": 5.04968977, "grad_norm": 0.82823426, "learning_rate": 8.676e-05, "elapsed_time_per_iteration": 6.43304658, "memory(GiB)": 21.51, "elapsed_time": "3h 14m 20s", "remaining_time": "8h 22m 36s", "loss_scale": 1.0, "consumed_samples": 454144, "global_step/max_steps": "1774/6362"} +{"lm loss": 5.05575275, "grad_norm": 0.89490157, "learning_rate": 8.674e-05, "elapsed_time_per_iteration": 6.34814429, "memory(GiB)": 21.51, "elapsed_time": "3h 14m 26s", "remaining_time": "8h 22m 28s", "loss_scale": 1.0, "consumed_samples": 454400, "global_step/max_steps": "1775/6362"} +{"lm loss": 5.04972887, "grad_norm": 0.84200406, "learning_rate": 8.673e-05, "elapsed_time_per_iteration": 6.56580853, "memory(GiB)": 21.51, "elapsed_time": "3h 14m 33s", "remaining_time": "8h 22m 22s", "loss_scale": 1.0, "consumed_samples": 454656, "global_step/max_steps": "1776/6362"} +{"lm loss": 5.06186056, "grad_norm": 0.70021707, "learning_rate": 8.671e-05, "elapsed_time_per_iteration": 6.39061379, "memory(GiB)": 21.51, "elapsed_time": "3h 14m 39s", "remaining_time": "8h 22m 15s", "loss_scale": 1.0, "consumed_samples": 454912, "global_step/max_steps": "1777/6362"} +{"lm loss": 5.05999899, "grad_norm": 0.71814722, "learning_rate": 8.669e-05, "elapsed_time_per_iteration": 6.68065906, "memory(GiB)": 21.51, "elapsed_time": "3h 14m 46s", "remaining_time": "8h 22m 9s", "loss_scale": 1.0, "consumed_samples": 455168, "global_step/max_steps": "1778/6362"} +{"lm loss": 5.03903246, "grad_norm": 0.74150139, "learning_rate": 8.668e-05, "elapsed_time_per_iteration": 6.73464203, "memory(GiB)": 21.51, "elapsed_time": "3h 14m 52s", "remaining_time": "8h 22m 2s", "loss_scale": 1.0, "consumed_samples": 455424, "global_step/max_steps": "1779/6362"} +{"lm loss": 5.03700018, "grad_norm": 0.79179138, "learning_rate": 8.666e-05, "elapsed_time_per_iteration": 6.53217769, "memory(GiB)": 21.51, "elapsed_time": "3h 14m 59s", "remaining_time": "8h 21m 56s", "loss_scale": 1.0, "consumed_samples": 455680, "global_step/max_steps": "1780/6362"} +{"lm loss": 5.07258606, "grad_norm": 0.75683272, "learning_rate": 8.664e-05, "elapsed_time_per_iteration": 6.70221448, "memory(GiB)": 21.51, "elapsed_time": "3h 15m 6s", "remaining_time": "8h 21m 49s", "loss_scale": 1.0, "consumed_samples": 455936, "global_step/max_steps": "1781/6362"} +{"lm loss": 5.04921961, "grad_norm": 0.85164076, "learning_rate": 8.662e-05, "elapsed_time_per_iteration": 6.43057632, "memory(GiB)": 21.51, "elapsed_time": "3h 15m 12s", "remaining_time": "8h 21m 43s", "loss_scale": 1.0, "consumed_samples": 456192, "global_step/max_steps": "1782/6362"} +{"lm loss": 5.0427599, "grad_norm": 0.86827368, "learning_rate": 8.661e-05, "elapsed_time_per_iteration": 6.49514461, "memory(GiB)": 21.51, "elapsed_time": "3h 15m 19s", "remaining_time": "8h 21m 36s", "loss_scale": 1.0, "consumed_samples": 456448, "global_step/max_steps": "1783/6362"} +{"lm loss": 5.04553604, "grad_norm": 0.98716128, "learning_rate": 8.659e-05, "elapsed_time_per_iteration": 6.43130779, "memory(GiB)": 21.51, "elapsed_time": "3h 15m 25s", "remaining_time": "8h 21m 29s", "loss_scale": 1.0, "consumed_samples": 456704, "global_step/max_steps": "1784/6362"} +{"lm loss": 5.07642365, "grad_norm": 0.87957937, "learning_rate": 8.657e-05, "elapsed_time_per_iteration": 6.38354301, "memory(GiB)": 21.51, "elapsed_time": "3h 15m 31s", "remaining_time": "8h 21m 22s", "loss_scale": 1.0, "consumed_samples": 456960, "global_step/max_steps": "1785/6362"} +{"lm loss": 5.0758009, "grad_norm": 0.72892058, "learning_rate": 8.655e-05, "elapsed_time_per_iteration": 6.41403389, "memory(GiB)": 21.51, "elapsed_time": "3h 15m 38s", "remaining_time": "8h 21m 15s", "loss_scale": 1.0, "consumed_samples": 457216, "global_step/max_steps": "1786/6362"} +{"lm loss": 5.05440855, "grad_norm": 0.7209565, "learning_rate": 8.654e-05, "elapsed_time_per_iteration": 6.80495429, "memory(GiB)": 21.51, "elapsed_time": "3h 15m 45s", "remaining_time": "8h 21m 9s", "loss_scale": 1.0, "consumed_samples": 457472, "global_step/max_steps": "1787/6362"} +{"lm loss": 5.04619551, "grad_norm": 0.72611219, "learning_rate": 8.652e-05, "elapsed_time_per_iteration": 6.49893475, "memory(GiB)": 21.51, "elapsed_time": "3h 15m 51s", "remaining_time": "8h 21m 2s", "loss_scale": 1.0, "consumed_samples": 457728, "global_step/max_steps": "1788/6362"} +{"lm loss": 5.04688215, "grad_norm": 0.85103703, "learning_rate": 8.65e-05, "elapsed_time_per_iteration": 6.47485995, "memory(GiB)": 21.51, "elapsed_time": "3h 15m 58s", "remaining_time": "8h 20m 55s", "loss_scale": 1.0, "consumed_samples": 457984, "global_step/max_steps": "1789/6362"} +{"lm loss": 5.05352259, "grad_norm": 0.98248333, "learning_rate": 8.648e-05, "elapsed_time_per_iteration": 6.7594595, "memory(GiB)": 21.51, "elapsed_time": "3h 16m 4s", "remaining_time": "8h 20m 49s", "loss_scale": 1.0, "consumed_samples": 458240, "global_step/max_steps": "1790/6362"} +{"lm loss": 5.0462327, "grad_norm": 1.01132393, "learning_rate": 8.647e-05, "elapsed_time_per_iteration": 6.62629342, "memory(GiB)": 21.51, "elapsed_time": "3h 16m 11s", "remaining_time": "8h 20m 43s", "loss_scale": 1.0, "consumed_samples": 458496, "global_step/max_steps": "1791/6362"} +{"lm loss": 5.03966475, "grad_norm": 0.76959771, "learning_rate": 8.645e-05, "elapsed_time_per_iteration": 6.39862251, "memory(GiB)": 21.51, "elapsed_time": "3h 16m 17s", "remaining_time": "8h 20m 36s", "loss_scale": 1.0, "consumed_samples": 458752, "global_step/max_steps": "1792/6362"} +{"lm loss": 5.0556097, "grad_norm": 0.64990228, "learning_rate": 8.643e-05, "elapsed_time_per_iteration": 6.52290654, "memory(GiB)": 21.51, "elapsed_time": "3h 16m 24s", "remaining_time": "8h 20m 29s", "loss_scale": 1.0, "consumed_samples": 459008, "global_step/max_steps": "1793/6362"} +{"lm loss": 5.0370369, "grad_norm": 0.71945816, "learning_rate": 8.641e-05, "elapsed_time_per_iteration": 6.4486506, "memory(GiB)": 21.51, "elapsed_time": "3h 16m 30s", "remaining_time": "8h 20m 22s", "loss_scale": 1.0, "consumed_samples": 459264, "global_step/max_steps": "1794/6362"} +{"lm loss": 5.06410694, "grad_norm": 0.76558208, "learning_rate": 8.64e-05, "elapsed_time_per_iteration": 6.58374071, "memory(GiB)": 21.51, "elapsed_time": "3h 16m 37s", "remaining_time": "8h 20m 16s", "loss_scale": 1.0, "consumed_samples": 459520, "global_step/max_steps": "1795/6362"} +{"lm loss": 5.04644299, "grad_norm": 0.73404109, "learning_rate": 8.638e-05, "elapsed_time_per_iteration": 6.29432631, "memory(GiB)": 21.51, "elapsed_time": "3h 16m 43s", "remaining_time": "8h 20m 8s", "loss_scale": 1.0, "consumed_samples": 459776, "global_step/max_steps": "1796/6362"} +{"lm loss": 5.06672192, "grad_norm": 0.68897957, "learning_rate": 8.636e-05, "elapsed_time_per_iteration": 6.49847293, "memory(GiB)": 21.51, "elapsed_time": "3h 16m 50s", "remaining_time": "8h 20m 2s", "loss_scale": 1.0, "consumed_samples": 460032, "global_step/max_steps": "1797/6362"} +{"lm loss": 5.06507635, "grad_norm": 0.6868313, "learning_rate": 8.634e-05, "elapsed_time_per_iteration": 6.42674375, "memory(GiB)": 21.51, "elapsed_time": "3h 16m 56s", "remaining_time": "8h 19m 55s", "loss_scale": 1.0, "consumed_samples": 460288, "global_step/max_steps": "1798/6362"} +{"lm loss": 5.03946114, "grad_norm": 0.60782814, "learning_rate": 8.633e-05, "elapsed_time_per_iteration": 6.57660913, "memory(GiB)": 21.51, "elapsed_time": "3h 17m 3s", "remaining_time": "8h 19m 48s", "loss_scale": 1.0, "consumed_samples": 460544, "global_step/max_steps": "1799/6362"} +{"lm loss": 5.03635454, "grad_norm": 0.64618653, "learning_rate": 8.631e-05, "elapsed_time_per_iteration": 6.49645567, "memory(GiB)": 21.51, "elapsed_time": "3h 17m 9s", "remaining_time": "8h 19m 41s", "loss_scale": 1.0, "consumed_samples": 460800, "global_step/max_steps": "1800/6362"} +{"lm loss": 5.06825924, "grad_norm": 0.72743076, "learning_rate": 8.629e-05, "elapsed_time_per_iteration": 6.64836717, "memory(GiB)": 21.51, "elapsed_time": "3h 17m 16s", "remaining_time": "8h 19m 35s", "loss_scale": 1.0, "consumed_samples": 461056, "global_step/max_steps": "1801/6362"} +{"lm loss": 5.02664185, "grad_norm": 0.70921248, "learning_rate": 8.627e-05, "elapsed_time_per_iteration": 6.52428126, "memory(GiB)": 21.51, "elapsed_time": "3h 17m 22s", "remaining_time": "8h 19m 28s", "loss_scale": 1.0, "consumed_samples": 461312, "global_step/max_steps": "1802/6362"} +{"lm loss": 5.04967928, "grad_norm": 0.64396197, "learning_rate": 8.626e-05, "elapsed_time_per_iteration": 6.49692369, "memory(GiB)": 21.51, "elapsed_time": "3h 17m 29s", "remaining_time": "8h 19m 21s", "loss_scale": 1.0, "consumed_samples": 461568, "global_step/max_steps": "1803/6362"} +{"lm loss": 5.02232742, "grad_norm": 0.67037117, "learning_rate": 8.624e-05, "elapsed_time_per_iteration": 6.70478797, "memory(GiB)": 21.51, "elapsed_time": "3h 17m 36s", "remaining_time": "8h 19m 15s", "loss_scale": 1.0, "consumed_samples": 461824, "global_step/max_steps": "1804/6362"} +{"lm loss": 5.05024004, "grad_norm": 0.67169428, "learning_rate": 8.622e-05, "elapsed_time_per_iteration": 6.62519288, "memory(GiB)": 21.51, "elapsed_time": "3h 17m 42s", "remaining_time": "8h 19m 9s", "loss_scale": 1.0, "consumed_samples": 462080, "global_step/max_steps": "1805/6362"} +{"lm loss": 5.03514147, "grad_norm": 0.80958247, "learning_rate": 8.62e-05, "elapsed_time_per_iteration": 6.60409665, "memory(GiB)": 21.51, "elapsed_time": "3h 17m 49s", "remaining_time": "8h 19m 2s", "loss_scale": 1.0, "consumed_samples": 462336, "global_step/max_steps": "1806/6362"} +{"lm loss": 5.04163742, "grad_norm": 0.98071533, "learning_rate": 8.619e-05, "elapsed_time_per_iteration": 6.60991001, "memory(GiB)": 21.51, "elapsed_time": "3h 17m 55s", "remaining_time": "8h 18m 56s", "loss_scale": 1.0, "consumed_samples": 462592, "global_step/max_steps": "1807/6362"} +{"lm loss": 5.05878687, "grad_norm": 1.07087529, "learning_rate": 8.617e-05, "elapsed_time_per_iteration": 6.50002337, "memory(GiB)": 21.51, "elapsed_time": "3h 18m 2s", "remaining_time": "8h 18m 49s", "loss_scale": 1.0, "consumed_samples": 462848, "global_step/max_steps": "1808/6362"} +{"lm loss": 5.03727818, "grad_norm": 0.89650404, "learning_rate": 8.615e-05, "elapsed_time_per_iteration": 6.73340917, "memory(GiB)": 21.51, "elapsed_time": "3h 18m 9s", "remaining_time": "8h 18m 43s", "loss_scale": 1.0, "consumed_samples": 463104, "global_step/max_steps": "1809/6362"} +{"lm loss": 5.04723215, "grad_norm": 0.83070123, "learning_rate": 8.613e-05, "elapsed_time_per_iteration": 6.55434537, "memory(GiB)": 21.51, "elapsed_time": "3h 18m 15s", "remaining_time": "8h 18m 36s", "loss_scale": 1.0, "consumed_samples": 463360, "global_step/max_steps": "1810/6362"} +{"lm loss": 5.04142714, "grad_norm": 0.78399706, "learning_rate": 8.612e-05, "elapsed_time_per_iteration": 6.4256568, "memory(GiB)": 21.51, "elapsed_time": "3h 18m 22s", "remaining_time": "8h 18m 29s", "loss_scale": 1.0, "consumed_samples": 463616, "global_step/max_steps": "1811/6362"} +{"lm loss": 5.04073143, "grad_norm": 0.70665908, "learning_rate": 8.61e-05, "elapsed_time_per_iteration": 6.27572179, "memory(GiB)": 21.51, "elapsed_time": "3h 18m 28s", "remaining_time": "8h 18m 22s", "loss_scale": 1.0, "consumed_samples": 463872, "global_step/max_steps": "1812/6362"} +{"lm loss": 5.04018068, "grad_norm": 0.67607522, "learning_rate": 8.608e-05, "elapsed_time_per_iteration": 6.44758797, "memory(GiB)": 21.51, "elapsed_time": "3h 18m 34s", "remaining_time": "8h 18m 15s", "loss_scale": 1.0, "consumed_samples": 464128, "global_step/max_steps": "1813/6362"} +{"lm loss": 5.04592848, "grad_norm": 0.73436058, "learning_rate": 8.606e-05, "elapsed_time_per_iteration": 6.43984723, "memory(GiB)": 21.51, "elapsed_time": "3h 18m 41s", "remaining_time": "8h 18m 8s", "loss_scale": 1.0, "consumed_samples": 464384, "global_step/max_steps": "1814/6362"} +{"lm loss": 5.03105927, "grad_norm": 0.76938236, "learning_rate": 8.604e-05, "elapsed_time_per_iteration": 6.33521676, "memory(GiB)": 21.51, "elapsed_time": "3h 18m 47s", "remaining_time": "8h 18m 1s", "loss_scale": 1.0, "consumed_samples": 464640, "global_step/max_steps": "1815/6362"} +{"lm loss": 5.04240227, "grad_norm": 0.78752381, "learning_rate": 8.603e-05, "elapsed_time_per_iteration": 6.61281919, "memory(GiB)": 21.51, "elapsed_time": "3h 18m 54s", "remaining_time": "8h 17m 55s", "loss_scale": 1.0, "consumed_samples": 464896, "global_step/max_steps": "1816/6362"} +{"lm loss": 5.05001497, "grad_norm": 0.83838701, "learning_rate": 8.601e-05, "elapsed_time_per_iteration": 6.52616072, "memory(GiB)": 21.51, "elapsed_time": "3h 19m 0s", "remaining_time": "8h 17m 48s", "loss_scale": 1.0, "consumed_samples": 465152, "global_step/max_steps": "1817/6362"} +{"lm loss": 5.03981066, "grad_norm": 0.89845961, "learning_rate": 8.599e-05, "elapsed_time_per_iteration": 6.67027092, "memory(GiB)": 21.51, "elapsed_time": "3h 19m 7s", "remaining_time": "8h 17m 42s", "loss_scale": 1.0, "consumed_samples": 465408, "global_step/max_steps": "1818/6362"} +{"lm loss": 5.03361607, "grad_norm": 0.8327468, "learning_rate": 8.597e-05, "elapsed_time_per_iteration": 6.64593577, "memory(GiB)": 21.51, "elapsed_time": "3h 19m 14s", "remaining_time": "8h 17m 35s", "loss_scale": 1.0, "consumed_samples": 465664, "global_step/max_steps": "1819/6362"} +{"lm loss": 5.03650331, "grad_norm": 0.70383924, "learning_rate": 8.596e-05, "elapsed_time_per_iteration": 6.59097528, "memory(GiB)": 21.51, "elapsed_time": "3h 19m 20s", "remaining_time": "8h 17m 29s", "loss_scale": 1.0, "consumed_samples": 465920, "global_step/max_steps": "1820/6362"} +{"lm loss": 5.01705647, "grad_norm": 0.57255512, "learning_rate": 8.594e-05, "elapsed_time_per_iteration": 6.67901278, "memory(GiB)": 21.51, "elapsed_time": "3h 19m 27s", "remaining_time": "8h 17m 22s", "loss_scale": 1.0, "consumed_samples": 466176, "global_step/max_steps": "1821/6362"} +{"lm loss": 5.03609228, "grad_norm": 0.66045666, "learning_rate": 8.592e-05, "elapsed_time_per_iteration": 6.58908081, "memory(GiB)": 21.51, "elapsed_time": "3h 19m 33s", "remaining_time": "8h 17m 16s", "loss_scale": 1.0, "consumed_samples": 466432, "global_step/max_steps": "1822/6362"} +{"lm loss": 5.04095125, "grad_norm": 0.74305242, "learning_rate": 8.59e-05, "elapsed_time_per_iteration": 6.62785935, "memory(GiB)": 21.51, "elapsed_time": "3h 19m 40s", "remaining_time": "8h 17m 9s", "loss_scale": 1.0, "consumed_samples": 466688, "global_step/max_steps": "1823/6362"} +{"lm loss": 5.05540276, "grad_norm": 0.65844274, "learning_rate": 8.589e-05, "elapsed_time_per_iteration": 6.37111092, "memory(GiB)": 21.51, "elapsed_time": "3h 19m 46s", "remaining_time": "8h 17m 2s", "loss_scale": 1.0, "consumed_samples": 466944, "global_step/max_steps": "1824/6362"} +{"lm loss": 5.05476189, "grad_norm": 0.66476756, "learning_rate": 8.587e-05, "elapsed_time_per_iteration": 6.53341579, "memory(GiB)": 21.51, "elapsed_time": "3h 19m 53s", "remaining_time": "8h 16m 56s", "loss_scale": 1.0, "consumed_samples": 467200, "global_step/max_steps": "1825/6362"} +{"lm loss": 5.03761673, "grad_norm": 0.61698908, "learning_rate": 8.585e-05, "elapsed_time_per_iteration": 6.71580219, "memory(GiB)": 21.51, "elapsed_time": "3h 20m 0s", "remaining_time": "8h 16m 49s", "loss_scale": 1.0, "consumed_samples": 467456, "global_step/max_steps": "1826/6362"} +{"lm loss": 5.04922867, "grad_norm": 0.5129683, "learning_rate": 8.583e-05, "elapsed_time_per_iteration": 6.49079132, "memory(GiB)": 21.51, "elapsed_time": "3h 20m 6s", "remaining_time": "8h 16m 43s", "loss_scale": 1.0, "consumed_samples": 467712, "global_step/max_steps": "1827/6362"} +{"lm loss": 5.04641581, "grad_norm": 0.56306297, "learning_rate": 8.581e-05, "elapsed_time_per_iteration": 6.4024384, "memory(GiB)": 21.51, "elapsed_time": "3h 20m 13s", "remaining_time": "8h 16m 36s", "loss_scale": 1.0, "consumed_samples": 467968, "global_step/max_steps": "1828/6362"} +{"lm loss": 5.04236889, "grad_norm": 0.55584973, "learning_rate": 8.58e-05, "elapsed_time_per_iteration": 6.38499856, "memory(GiB)": 21.51, "elapsed_time": "3h 20m 19s", "remaining_time": "8h 16m 29s", "loss_scale": 1.0, "consumed_samples": 468224, "global_step/max_steps": "1829/6362"} +{"lm loss": 5.08405638, "grad_norm": 0.62423015, "learning_rate": 8.578e-05, "elapsed_time_per_iteration": 6.59391642, "memory(GiB)": 21.51, "elapsed_time": "3h 20m 26s", "remaining_time": "8h 16m 22s", "loss_scale": 1.0, "consumed_samples": 468480, "global_step/max_steps": "1830/6362"} +{"lm loss": 5.04563904, "grad_norm": 0.66321713, "learning_rate": 8.576e-05, "elapsed_time_per_iteration": 6.54882979, "memory(GiB)": 21.51, "elapsed_time": "3h 20m 32s", "remaining_time": "8h 16m 15s", "loss_scale": 1.0, "consumed_samples": 468736, "global_step/max_steps": "1831/6362"} +{"lm loss": 5.03654766, "grad_norm": 0.69916123, "learning_rate": 8.574e-05, "elapsed_time_per_iteration": 6.5647521, "memory(GiB)": 21.51, "elapsed_time": "3h 20m 39s", "remaining_time": "8h 16m 9s", "loss_scale": 1.0, "consumed_samples": 468992, "global_step/max_steps": "1832/6362"} +{"lm loss": 5.0493207, "grad_norm": 0.72802389, "learning_rate": 8.572e-05, "elapsed_time_per_iteration": 6.61192703, "memory(GiB)": 21.51, "elapsed_time": "3h 20m 45s", "remaining_time": "8h 16m 2s", "loss_scale": 1.0, "consumed_samples": 469248, "global_step/max_steps": "1833/6362"} +{"lm loss": 5.04845858, "grad_norm": 0.85753679, "learning_rate": 8.571e-05, "elapsed_time_per_iteration": 6.51579022, "memory(GiB)": 21.51, "elapsed_time": "3h 20m 52s", "remaining_time": "8h 15m 56s", "loss_scale": 1.0, "consumed_samples": 469504, "global_step/max_steps": "1834/6362"} +{"lm loss": 5.04967451, "grad_norm": 1.15453756, "learning_rate": 8.569e-05, "elapsed_time_per_iteration": 6.62109923, "memory(GiB)": 21.51, "elapsed_time": "3h 20m 58s", "remaining_time": "8h 15m 49s", "loss_scale": 1.0, "consumed_samples": 469760, "global_step/max_steps": "1835/6362"} +{"lm loss": 5.04249287, "grad_norm": 1.01133633, "learning_rate": 8.567e-05, "elapsed_time_per_iteration": 6.55710769, "memory(GiB)": 21.51, "elapsed_time": "3h 21m 5s", "remaining_time": "8h 15m 43s", "loss_scale": 1.0, "consumed_samples": 470016, "global_step/max_steps": "1836/6362"} +{"lm loss": 5.02934599, "grad_norm": 0.90761775, "learning_rate": 8.565e-05, "elapsed_time_per_iteration": 6.37348676, "memory(GiB)": 21.51, "elapsed_time": "3h 21m 11s", "remaining_time": "8h 15m 36s", "loss_scale": 1.0, "consumed_samples": 470272, "global_step/max_steps": "1837/6362"} +{"lm loss": 5.05623436, "grad_norm": 0.78610766, "learning_rate": 8.564e-05, "elapsed_time_per_iteration": 6.59366393, "memory(GiB)": 21.51, "elapsed_time": "3h 21m 18s", "remaining_time": "8h 15m 29s", "loss_scale": 1.0, "consumed_samples": 470528, "global_step/max_steps": "1838/6362"} +{"lm loss": 5.03254604, "grad_norm": 0.84308636, "learning_rate": 8.562e-05, "elapsed_time_per_iteration": 6.39768791, "memory(GiB)": 21.51, "elapsed_time": "3h 21m 24s", "remaining_time": "8h 15m 22s", "loss_scale": 1.0, "consumed_samples": 470784, "global_step/max_steps": "1839/6362"} +{"lm loss": 5.06088543, "grad_norm": 0.87545526, "learning_rate": 8.56e-05, "elapsed_time_per_iteration": 6.75285745, "memory(GiB)": 21.51, "elapsed_time": "3h 21m 31s", "remaining_time": "8h 15m 16s", "loss_scale": 1.0, "consumed_samples": 471040, "global_step/max_steps": "1840/6362"} +{"lm loss": 5.03204679, "grad_norm": 0.79744738, "learning_rate": 8.558e-05, "elapsed_time_per_iteration": 6.3980124, "memory(GiB)": 21.51, "elapsed_time": "3h 21m 38s", "remaining_time": "8h 15m 9s", "loss_scale": 1.0, "consumed_samples": 471296, "global_step/max_steps": "1841/6362"} +{"lm loss": 5.00540543, "grad_norm": 0.75986969, "learning_rate": 8.556e-05, "elapsed_time_per_iteration": 6.41655135, "memory(GiB)": 21.51, "elapsed_time": "3h 21m 44s", "remaining_time": "8h 15m 2s", "loss_scale": 1.0, "consumed_samples": 471552, "global_step/max_steps": "1842/6362"} +{"lm loss": 5.03923273, "grad_norm": 0.64226496, "learning_rate": 8.555e-05, "elapsed_time_per_iteration": 6.38117623, "memory(GiB)": 21.51, "elapsed_time": "3h 21m 50s", "remaining_time": "8h 14m 55s", "loss_scale": 1.0, "consumed_samples": 471808, "global_step/max_steps": "1843/6362"} +{"lm loss": 5.04734564, "grad_norm": 0.71674293, "learning_rate": 8.553e-05, "elapsed_time_per_iteration": 6.64675975, "memory(GiB)": 21.51, "elapsed_time": "3h 21m 57s", "remaining_time": "8h 14m 49s", "loss_scale": 1.0, "consumed_samples": 472064, "global_step/max_steps": "1844/6362"} +{"lm loss": 5.04547405, "grad_norm": 0.69118094, "learning_rate": 8.551e-05, "elapsed_time_per_iteration": 6.77080655, "memory(GiB)": 21.51, "elapsed_time": "3h 22m 4s", "remaining_time": "8h 14m 42s", "loss_scale": 1.0, "consumed_samples": 472320, "global_step/max_steps": "1845/6362"} +{"lm loss": 5.02600193, "grad_norm": 0.68996179, "learning_rate": 8.549e-05, "elapsed_time_per_iteration": 6.51755047, "memory(GiB)": 21.51, "elapsed_time": "3h 22m 10s", "remaining_time": "8h 14m 36s", "loss_scale": 1.0, "consumed_samples": 472576, "global_step/max_steps": "1846/6362"} +{"lm loss": 5.0401926, "grad_norm": 0.70436269, "learning_rate": 8.547e-05, "elapsed_time_per_iteration": 6.87306833, "memory(GiB)": 21.51, "elapsed_time": "3h 22m 17s", "remaining_time": "8h 14m 30s", "loss_scale": 1.0, "consumed_samples": 472832, "global_step/max_steps": "1847/6362"} +{"lm loss": 5.04414511, "grad_norm": 0.7254234, "learning_rate": 8.546e-05, "elapsed_time_per_iteration": 6.96065545, "memory(GiB)": 21.51, "elapsed_time": "3h 22m 24s", "remaining_time": "8h 14m 24s", "loss_scale": 1.0, "consumed_samples": 473088, "global_step/max_steps": "1848/6362"} +{"lm loss": 5.0309248, "grad_norm": 0.76084876, "learning_rate": 8.544e-05, "elapsed_time_per_iteration": 6.53765607, "memory(GiB)": 21.51, "elapsed_time": "3h 22m 31s", "remaining_time": "8h 14m 18s", "loss_scale": 1.0, "consumed_samples": 473344, "global_step/max_steps": "1849/6362"} +{"lm loss": 5.0547123, "grad_norm": 0.71228474, "learning_rate": 8.542e-05, "elapsed_time_per_iteration": 6.69096899, "memory(GiB)": 21.51, "elapsed_time": "3h 22m 37s", "remaining_time": "8h 14m 11s", "loss_scale": 1.0, "consumed_samples": 473600, "global_step/max_steps": "1850/6362"} +{"lm loss": 5.04093313, "grad_norm": 0.78026813, "learning_rate": 8.54e-05, "elapsed_time_per_iteration": 6.68381548, "memory(GiB)": 21.51, "elapsed_time": "3h 22m 44s", "remaining_time": "8h 14m 5s", "loss_scale": 1.0, "consumed_samples": 473856, "global_step/max_steps": "1851/6362"} +{"lm loss": 5.02588272, "grad_norm": 0.91182345, "learning_rate": 8.538e-05, "elapsed_time_per_iteration": 6.61566782, "memory(GiB)": 21.51, "elapsed_time": "3h 22m 51s", "remaining_time": "8h 13m 59s", "loss_scale": 1.0, "consumed_samples": 474112, "global_step/max_steps": "1852/6362"} +{"lm loss": 5.0660696, "grad_norm": 0.86097831, "learning_rate": 8.537e-05, "elapsed_time_per_iteration": 6.75496006, "memory(GiB)": 21.51, "elapsed_time": "3h 22m 57s", "remaining_time": "8h 13m 53s", "loss_scale": 1.0, "consumed_samples": 474368, "global_step/max_steps": "1853/6362"} +{"lm loss": 5.03815413, "grad_norm": 0.74968964, "learning_rate": 8.535e-05, "elapsed_time_per_iteration": 6.51810598, "memory(GiB)": 21.51, "elapsed_time": "3h 23m 4s", "remaining_time": "8h 13m 46s", "loss_scale": 1.0, "consumed_samples": 474624, "global_step/max_steps": "1854/6362"} +{"lm loss": 5.01322603, "grad_norm": 0.66154051, "learning_rate": 8.533e-05, "elapsed_time_per_iteration": 6.67474413, "memory(GiB)": 21.51, "elapsed_time": "3h 23m 11s", "remaining_time": "8h 13m 39s", "loss_scale": 1.0, "consumed_samples": 474880, "global_step/max_steps": "1855/6362"} +{"lm loss": 5.04693413, "grad_norm": 0.60041118, "learning_rate": 8.531e-05, "elapsed_time_per_iteration": 6.50922894, "memory(GiB)": 21.51, "elapsed_time": "3h 23m 17s", "remaining_time": "8h 13m 33s", "loss_scale": 1.0, "consumed_samples": 475136, "global_step/max_steps": "1856/6362"} +{"lm loss": 5.04376888, "grad_norm": 0.665353, "learning_rate": 8.529e-05, "elapsed_time_per_iteration": 6.56166029, "memory(GiB)": 21.51, "elapsed_time": "3h 23m 24s", "remaining_time": "8h 13m 26s", "loss_scale": 1.0, "consumed_samples": 475392, "global_step/max_steps": "1857/6362"} +{"lm loss": 5.021029, "grad_norm": 0.69931847, "learning_rate": 8.528e-05, "elapsed_time_per_iteration": 6.35214043, "memory(GiB)": 21.51, "elapsed_time": "3h 23m 30s", "remaining_time": "8h 13m 19s", "loss_scale": 1.0, "consumed_samples": 475648, "global_step/max_steps": "1858/6362"} +{"lm loss": 5.05357552, "grad_norm": 0.76330674, "learning_rate": 8.526e-05, "elapsed_time_per_iteration": 6.42981005, "memory(GiB)": 21.51, "elapsed_time": "3h 23m 36s", "remaining_time": "8h 13m 12s", "loss_scale": 1.0, "consumed_samples": 475904, "global_step/max_steps": "1859/6362"} +{"lm loss": 5.0239687, "grad_norm": 0.81151974, "learning_rate": 8.524e-05, "elapsed_time_per_iteration": 6.44959784, "memory(GiB)": 21.51, "elapsed_time": "3h 23m 43s", "remaining_time": "8h 13m 5s", "loss_scale": 1.0, "consumed_samples": 476160, "global_step/max_steps": "1860/6362"} +{"lm loss": 5.04190731, "grad_norm": 0.83435786, "learning_rate": 8.522e-05, "elapsed_time_per_iteration": 6.69316459, "memory(GiB)": 21.51, "elapsed_time": "3h 23m 50s", "remaining_time": "8h 12m 59s", "loss_scale": 1.0, "consumed_samples": 476416, "global_step/max_steps": "1861/6362"} +{"lm loss": 5.02962017, "grad_norm": 0.8660863, "learning_rate": 8.52e-05, "elapsed_time_per_iteration": 6.66580129, "memory(GiB)": 21.51, "elapsed_time": "3h 23m 56s", "remaining_time": "8h 12m 53s", "loss_scale": 1.0, "consumed_samples": 476672, "global_step/max_steps": "1862/6362"} +{"lm loss": 5.02890968, "grad_norm": 0.8410008, "learning_rate": 8.518e-05, "elapsed_time_per_iteration": 6.55000639, "memory(GiB)": 21.51, "elapsed_time": "3h 24m 3s", "remaining_time": "8h 12m 46s", "loss_scale": 1.0, "consumed_samples": 476928, "global_step/max_steps": "1863/6362"} +{"lm loss": 5.04315186, "grad_norm": 0.6854592, "learning_rate": 8.517e-05, "elapsed_time_per_iteration": 6.38416362, "memory(GiB)": 21.51, "elapsed_time": "3h 24m 9s", "remaining_time": "8h 12m 39s", "loss_scale": 1.0, "consumed_samples": 477184, "global_step/max_steps": "1864/6362"} +{"lm loss": 5.04863882, "grad_norm": 0.6169039, "learning_rate": 8.515e-05, "elapsed_time_per_iteration": 6.56865263, "memory(GiB)": 21.51, "elapsed_time": "3h 24m 16s", "remaining_time": "8h 12m 32s", "loss_scale": 1.0, "consumed_samples": 477440, "global_step/max_steps": "1865/6362"} +{"lm loss": 5.02902031, "grad_norm": 0.79044932, "learning_rate": 8.513e-05, "elapsed_time_per_iteration": 6.37815499, "memory(GiB)": 21.51, "elapsed_time": "3h 24m 22s", "remaining_time": "8h 12m 25s", "loss_scale": 1.0, "consumed_samples": 477696, "global_step/max_steps": "1866/6362"} +{"lm loss": 5.04855585, "grad_norm": 0.6912896, "learning_rate": 8.511e-05, "elapsed_time_per_iteration": 6.62683988, "memory(GiB)": 21.51, "elapsed_time": "3h 24m 29s", "remaining_time": "8h 12m 19s", "loss_scale": 1.0, "consumed_samples": 477952, "global_step/max_steps": "1867/6362"} +{"lm loss": 5.03458452, "grad_norm": 0.55619347, "learning_rate": 8.509e-05, "elapsed_time_per_iteration": 6.46669292, "memory(GiB)": 21.51, "elapsed_time": "3h 24m 35s", "remaining_time": "8h 12m 12s", "loss_scale": 1.0, "consumed_samples": 478208, "global_step/max_steps": "1868/6362"} +{"lm loss": 5.0377326, "grad_norm": 0.6303519, "learning_rate": 8.508e-05, "elapsed_time_per_iteration": 6.4193449, "memory(GiB)": 21.51, "elapsed_time": "3h 24m 42s", "remaining_time": "8h 12m 5s", "loss_scale": 1.0, "consumed_samples": 478464, "global_step/max_steps": "1869/6362"} +{"lm loss": 5.04951, "grad_norm": 0.67540807, "learning_rate": 8.506e-05, "elapsed_time_per_iteration": 6.46634245, "memory(GiB)": 21.51, "elapsed_time": "3h 24m 48s", "remaining_time": "8h 11m 58s", "loss_scale": 1.0, "consumed_samples": 478720, "global_step/max_steps": "1870/6362"} +{"lm loss": 5.04082966, "grad_norm": 0.73012346, "learning_rate": 8.504e-05, "elapsed_time_per_iteration": 6.37299538, "memory(GiB)": 21.51, "elapsed_time": "3h 24m 54s", "remaining_time": "8h 11m 51s", "loss_scale": 1.0, "consumed_samples": 478976, "global_step/max_steps": "1871/6362"} +{"lm loss": 5.01054192, "grad_norm": 0.77028579, "learning_rate": 8.502e-05, "elapsed_time_per_iteration": 6.47018147, "memory(GiB)": 21.51, "elapsed_time": "3h 25m 1s", "remaining_time": "8h 11m 45s", "loss_scale": 1.0, "consumed_samples": 479232, "global_step/max_steps": "1872/6362"} +{"lm loss": 5.04101849, "grad_norm": 0.7918871, "learning_rate": 8.5e-05, "elapsed_time_per_iteration": 6.42115593, "memory(GiB)": 21.51, "elapsed_time": "3h 25m 7s", "remaining_time": "8h 11m 38s", "loss_scale": 1.0, "consumed_samples": 479488, "global_step/max_steps": "1873/6362"} +{"lm loss": 5.05159569, "grad_norm": 0.75220096, "learning_rate": 8.498e-05, "elapsed_time_per_iteration": 6.94275165, "memory(GiB)": 21.51, "elapsed_time": "3h 25m 14s", "remaining_time": "8h 11m 32s", "loss_scale": 1.0, "consumed_samples": 479744, "global_step/max_steps": "1874/6362"} +{"lm loss": 5.04184723, "grad_norm": 0.75862479, "learning_rate": 8.497e-05, "elapsed_time_per_iteration": 6.56603622, "memory(GiB)": 21.51, "elapsed_time": "3h 25m 21s", "remaining_time": "8h 11m 25s", "loss_scale": 1.0, "consumed_samples": 480000, "global_step/max_steps": "1875/6362"} +{"lm loss": 5.03414965, "grad_norm": 0.75267231, "learning_rate": 8.495e-05, "elapsed_time_per_iteration": 6.77156663, "memory(GiB)": 21.51, "elapsed_time": "3h 25m 28s", "remaining_time": "8h 11m 19s", "loss_scale": 1.0, "consumed_samples": 480256, "global_step/max_steps": "1876/6362"} +{"lm loss": 5.04162455, "grad_norm": 0.6903947, "learning_rate": 8.493e-05, "elapsed_time_per_iteration": 6.41837358, "memory(GiB)": 21.51, "elapsed_time": "3h 25m 34s", "remaining_time": "8h 11m 12s", "loss_scale": 1.0, "consumed_samples": 480512, "global_step/max_steps": "1877/6362"} +{"lm loss": 5.03134584, "grad_norm": 0.73471457, "learning_rate": 8.491e-05, "elapsed_time_per_iteration": 6.54694867, "memory(GiB)": 21.51, "elapsed_time": "3h 25m 41s", "remaining_time": "8h 11m 6s", "loss_scale": 1.0, "consumed_samples": 480768, "global_step/max_steps": "1878/6362"} +{"lm loss": 5.02287769, "grad_norm": 0.74186862, "learning_rate": 8.489e-05, "elapsed_time_per_iteration": 6.39358306, "memory(GiB)": 21.51, "elapsed_time": "3h 25m 47s", "remaining_time": "8h 10m 59s", "loss_scale": 1.0, "consumed_samples": 481024, "global_step/max_steps": "1879/6362"} +{"lm loss": 5.0296402, "grad_norm": 0.71588051, "learning_rate": 8.487e-05, "elapsed_time_per_iteration": 6.4789598, "memory(GiB)": 21.51, "elapsed_time": "3h 25m 53s", "remaining_time": "8h 10m 52s", "loss_scale": 1.0, "consumed_samples": 481280, "global_step/max_steps": "1880/6362"} +{"lm loss": 5.03701591, "grad_norm": 0.77328259, "learning_rate": 8.486e-05, "elapsed_time_per_iteration": 6.49718857, "memory(GiB)": 21.51, "elapsed_time": "3h 26m 0s", "remaining_time": "8h 10m 45s", "loss_scale": 1.0, "consumed_samples": 481536, "global_step/max_steps": "1881/6362"} +{"lm loss": 5.03770781, "grad_norm": 0.7656641, "learning_rate": 8.484e-05, "elapsed_time_per_iteration": 6.49661183, "memory(GiB)": 21.51, "elapsed_time": "3h 26m 6s", "remaining_time": "8h 10m 38s", "loss_scale": 1.0, "consumed_samples": 481792, "global_step/max_steps": "1882/6362"} +{"lm loss": 5.03497791, "grad_norm": 1.68476105, "learning_rate": 8.482e-05, "elapsed_time_per_iteration": 6.46720314, "memory(GiB)": 21.51, "elapsed_time": "3h 26m 13s", "remaining_time": "8h 10m 32s", "loss_scale": 1.0, "consumed_samples": 482048, "global_step/max_steps": "1883/6362"} +{"lm loss": 5.04439974, "grad_norm": 0.78585088, "learning_rate": 8.48e-05, "elapsed_time_per_iteration": 6.6390934, "memory(GiB)": 21.51, "elapsed_time": "3h 26m 20s", "remaining_time": "8h 10m 25s", "loss_scale": 1.0, "consumed_samples": 482304, "global_step/max_steps": "1884/6362"} +{"lm loss": 5.04269791, "grad_norm": 1.03492486, "learning_rate": 8.478e-05, "elapsed_time_per_iteration": 6.36188865, "memory(GiB)": 21.51, "elapsed_time": "3h 26m 26s", "remaining_time": "8h 10m 18s", "loss_scale": 1.0, "consumed_samples": 482560, "global_step/max_steps": "1885/6362"} +{"lm loss": 5.0701623, "grad_norm": 0.73647088, "learning_rate": 8.477e-05, "elapsed_time_per_iteration": 6.48860741, "memory(GiB)": 21.51, "elapsed_time": "3h 26m 32s", "remaining_time": "8h 10m 11s", "loss_scale": 1.0, "consumed_samples": 482816, "global_step/max_steps": "1886/6362"} +{"lm loss": 5.0455389, "grad_norm": 0.92964244, "learning_rate": 8.475e-05, "elapsed_time_per_iteration": 6.75548887, "memory(GiB)": 21.51, "elapsed_time": "3h 26m 39s", "remaining_time": "8h 10m 5s", "loss_scale": 1.0, "consumed_samples": 483072, "global_step/max_steps": "1887/6362"} +{"lm loss": 5.03112316, "grad_norm": 0.74302793, "learning_rate": 8.473e-05, "elapsed_time_per_iteration": 6.80510378, "memory(GiB)": 21.51, "elapsed_time": "3h 26m 46s", "remaining_time": "8h 9m 59s", "loss_scale": 1.0, "consumed_samples": 483328, "global_step/max_steps": "1888/6362"} +{"lm loss": 5.02829885, "grad_norm": 0.80432576, "learning_rate": 8.471e-05, "elapsed_time_per_iteration": 6.59811449, "memory(GiB)": 21.51, "elapsed_time": "3h 26m 53s", "remaining_time": "8h 9m 53s", "loss_scale": 1.0, "consumed_samples": 483584, "global_step/max_steps": "1889/6362"} +{"lm loss": 5.02646255, "grad_norm": 0.90093589, "learning_rate": 8.469e-05, "elapsed_time_per_iteration": 6.50812006, "memory(GiB)": 21.51, "elapsed_time": "3h 26m 59s", "remaining_time": "8h 9m 46s", "loss_scale": 1.0, "consumed_samples": 483840, "global_step/max_steps": "1890/6362"} +{"lm loss": 5.01892185, "grad_norm": 0.70622432, "learning_rate": 8.467e-05, "elapsed_time_per_iteration": 6.74378467, "memory(GiB)": 21.51, "elapsed_time": "3h 27m 6s", "remaining_time": "8h 9m 40s", "loss_scale": 1.0, "consumed_samples": 484096, "global_step/max_steps": "1891/6362"} +{"lm loss": 5.04017544, "grad_norm": 0.76871628, "learning_rate": 8.465e-05, "elapsed_time_per_iteration": 6.8597827, "memory(GiB)": 21.51, "elapsed_time": "3h 27m 13s", "remaining_time": "8h 9m 34s", "loss_scale": 1.0, "consumed_samples": 484352, "global_step/max_steps": "1892/6362"} +{"lm loss": 5.03001642, "grad_norm": 0.83208805, "learning_rate": 8.464e-05, "elapsed_time_per_iteration": 6.60534692, "memory(GiB)": 21.51, "elapsed_time": "3h 27m 19s", "remaining_time": "8h 9m 27s", "loss_scale": 1.0, "consumed_samples": 484608, "global_step/max_steps": "1893/6362"} +{"lm loss": 5.02788973, "grad_norm": 0.89642888, "learning_rate": 8.462e-05, "elapsed_time_per_iteration": 6.46628356, "memory(GiB)": 21.51, "elapsed_time": "3h 27m 26s", "remaining_time": "8h 9m 21s", "loss_scale": 1.0, "consumed_samples": 484864, "global_step/max_steps": "1894/6362"} +{"lm loss": 5.03235722, "grad_norm": 0.86188442, "learning_rate": 8.46e-05, "elapsed_time_per_iteration": 6.44956303, "memory(GiB)": 21.51, "elapsed_time": "3h 27m 32s", "remaining_time": "8h 9m 14s", "loss_scale": 1.0, "consumed_samples": 485120, "global_step/max_steps": "1895/6362"} +{"lm loss": 5.02202892, "grad_norm": 0.67185467, "learning_rate": 8.458e-05, "elapsed_time_per_iteration": 6.82285976, "memory(GiB)": 21.51, "elapsed_time": "3h 27m 39s", "remaining_time": "8h 9m 8s", "loss_scale": 1.0, "consumed_samples": 485376, "global_step/max_steps": "1896/6362"} +{"lm loss": 5.04681969, "grad_norm": 0.62489247, "learning_rate": 8.456e-05, "elapsed_time_per_iteration": 6.42091918, "memory(GiB)": 21.51, "elapsed_time": "3h 27m 45s", "remaining_time": "8h 9m 1s", "loss_scale": 1.0, "consumed_samples": 485632, "global_step/max_steps": "1897/6362"} +{"lm loss": 5.03426409, "grad_norm": 0.71659875, "learning_rate": 8.454e-05, "elapsed_time_per_iteration": 6.60250235, "memory(GiB)": 21.51, "elapsed_time": "3h 27m 52s", "remaining_time": "8h 8m 54s", "loss_scale": 1.0, "consumed_samples": 485888, "global_step/max_steps": "1898/6362"} +{"lm loss": 5.01578951, "grad_norm": 0.729532, "learning_rate": 8.453e-05, "elapsed_time_per_iteration": 6.54579544, "memory(GiB)": 21.51, "elapsed_time": "3h 27m 59s", "remaining_time": "8h 8m 48s", "loss_scale": 1.0, "consumed_samples": 486144, "global_step/max_steps": "1899/6362"} +{"lm loss": 5.0572958, "grad_norm": 0.63989276, "learning_rate": 8.451e-05, "elapsed_time_per_iteration": 6.4326961, "memory(GiB)": 21.51, "elapsed_time": "3h 28m 5s", "remaining_time": "8h 8m 41s", "loss_scale": 1.0, "consumed_samples": 486400, "global_step/max_steps": "1900/6362"} +{"lm loss": 5.02568245, "grad_norm": 0.63051391, "learning_rate": 8.449e-05, "elapsed_time_per_iteration": 6.24321914, "memory(GiB)": 21.51, "elapsed_time": "3h 28m 11s", "remaining_time": "8h 8m 33s", "loss_scale": 1.0, "consumed_samples": 486656, "global_step/max_steps": "1901/6362"} +{"lm loss": 5.03553057, "grad_norm": 0.67471147, "learning_rate": 8.447e-05, "elapsed_time_per_iteration": 6.63973522, "memory(GiB)": 21.51, "elapsed_time": "3h 28m 18s", "remaining_time": "8h 8m 27s", "loss_scale": 1.0, "consumed_samples": 486912, "global_step/max_steps": "1902/6362"} +{"lm loss": 5.05009985, "grad_norm": 0.69609994, "learning_rate": 8.445e-05, "elapsed_time_per_iteration": 6.53800511, "memory(GiB)": 21.51, "elapsed_time": "3h 28m 24s", "remaining_time": "8h 8m 20s", "loss_scale": 1.0, "consumed_samples": 487168, "global_step/max_steps": "1903/6362"} +{"lm loss": 5.04270983, "grad_norm": 0.6426965, "learning_rate": 8.443e-05, "elapsed_time_per_iteration": 6.44396186, "memory(GiB)": 21.51, "elapsed_time": "3h 28m 31s", "remaining_time": "8h 8m 13s", "loss_scale": 1.0, "consumed_samples": 487424, "global_step/max_steps": "1904/6362"} +{"lm loss": 5.00527525, "grad_norm": 0.58029765, "learning_rate": 8.441e-05, "elapsed_time_per_iteration": 6.46962857, "memory(GiB)": 21.51, "elapsed_time": "3h 28m 37s", "remaining_time": "8h 8m 7s", "loss_scale": 1.0, "consumed_samples": 487680, "global_step/max_steps": "1905/6362"} +{"lm loss": 5.02289629, "grad_norm": 0.59472305, "learning_rate": 8.44e-05, "elapsed_time_per_iteration": 6.61361146, "memory(GiB)": 21.51, "elapsed_time": "3h 28m 44s", "remaining_time": "8h 8m 0s", "loss_scale": 1.0, "consumed_samples": 487936, "global_step/max_steps": "1906/6362"} +{"lm loss": 5.03082991, "grad_norm": 0.71390742, "learning_rate": 8.438e-05, "elapsed_time_per_iteration": 6.59771442, "memory(GiB)": 21.51, "elapsed_time": "3h 28m 51s", "remaining_time": "8h 7m 54s", "loss_scale": 1.0, "consumed_samples": 488192, "global_step/max_steps": "1907/6362"} +{"lm loss": 5.04438686, "grad_norm": 0.72952926, "learning_rate": 8.436e-05, "elapsed_time_per_iteration": 6.60185099, "memory(GiB)": 21.51, "elapsed_time": "3h 28m 57s", "remaining_time": "8h 7m 47s", "loss_scale": 1.0, "consumed_samples": 488448, "global_step/max_steps": "1908/6362"} +{"lm loss": 5.03969431, "grad_norm": 0.72672647, "learning_rate": 8.434e-05, "elapsed_time_per_iteration": 6.5974946, "memory(GiB)": 21.51, "elapsed_time": "3h 29m 4s", "remaining_time": "8h 7m 41s", "loss_scale": 1.0, "consumed_samples": 488704, "global_step/max_steps": "1909/6362"} +{"lm loss": 5.02778721, "grad_norm": 0.77760482, "learning_rate": 8.432e-05, "elapsed_time_per_iteration": 6.51123643, "memory(GiB)": 21.51, "elapsed_time": "3h 29m 10s", "remaining_time": "8h 7m 34s", "loss_scale": 1.0, "consumed_samples": 488960, "global_step/max_steps": "1910/6362"} +{"lm loss": 5.04160261, "grad_norm": 0.77289337, "learning_rate": 8.43e-05, "elapsed_time_per_iteration": 6.65825295, "memory(GiB)": 21.51, "elapsed_time": "3h 29m 17s", "remaining_time": "8h 7m 28s", "loss_scale": 1.0, "consumed_samples": 489216, "global_step/max_steps": "1911/6362"} +{"lm loss": 5.06619883, "grad_norm": 0.81343997, "learning_rate": 8.429e-05, "elapsed_time_per_iteration": 6.4691329, "memory(GiB)": 21.51, "elapsed_time": "3h 29m 23s", "remaining_time": "8h 7m 21s", "loss_scale": 1.0, "consumed_samples": 489472, "global_step/max_steps": "1912/6362"} +{"lm loss": 5.03168869, "grad_norm": 0.93300855, "learning_rate": 8.427e-05, "elapsed_time_per_iteration": 6.49592304, "memory(GiB)": 21.51, "elapsed_time": "3h 29m 30s", "remaining_time": "8h 7m 14s", "loss_scale": 1.0, "consumed_samples": 489728, "global_step/max_steps": "1913/6362"} +{"lm loss": 5.02829695, "grad_norm": 0.83951104, "learning_rate": 8.425e-05, "elapsed_time_per_iteration": 6.53400826, "memory(GiB)": 21.51, "elapsed_time": "3h 29m 36s", "remaining_time": "8h 7m 7s", "loss_scale": 1.0, "consumed_samples": 489984, "global_step/max_steps": "1914/6362"} +{"lm loss": 5.04006243, "grad_norm": 0.84438324, "learning_rate": 8.423e-05, "elapsed_time_per_iteration": 6.92423415, "memory(GiB)": 21.51, "elapsed_time": "3h 29m 43s", "remaining_time": "8h 7m 2s", "loss_scale": 1.0, "consumed_samples": 490240, "global_step/max_steps": "1915/6362"} +{"lm loss": 5.04091501, "grad_norm": 0.97039181, "learning_rate": 8.421e-05, "elapsed_time_per_iteration": 6.76287675, "memory(GiB)": 21.51, "elapsed_time": "3h 29m 50s", "remaining_time": "8h 6m 56s", "loss_scale": 1.0, "consumed_samples": 490496, "global_step/max_steps": "1916/6362"} +{"lm loss": 5.04209185, "grad_norm": 0.86805332, "learning_rate": 8.419e-05, "elapsed_time_per_iteration": 6.43257666, "memory(GiB)": 21.51, "elapsed_time": "3h 29m 57s", "remaining_time": "8h 6m 49s", "loss_scale": 1.0, "consumed_samples": 490752, "global_step/max_steps": "1917/6362"} +{"lm loss": 5.01055431, "grad_norm": 0.72757906, "learning_rate": 8.417e-05, "elapsed_time_per_iteration": 6.63284039, "memory(GiB)": 21.51, "elapsed_time": "3h 30m 3s", "remaining_time": "8h 6m 42s", "loss_scale": 1.0, "consumed_samples": 491008, "global_step/max_steps": "1918/6362"} +{"lm loss": 5.0299983, "grad_norm": 0.70149863, "learning_rate": 8.415e-05, "elapsed_time_per_iteration": 6.69614983, "memory(GiB)": 21.51, "elapsed_time": "3h 30m 10s", "remaining_time": "8h 6m 36s", "loss_scale": 1.0, "consumed_samples": 491264, "global_step/max_steps": "1919/6362"} +{"lm loss": 5.03343487, "grad_norm": 0.72455555, "learning_rate": 8.414e-05, "elapsed_time_per_iteration": 6.50243711, "memory(GiB)": 21.51, "elapsed_time": "3h 30m 16s", "remaining_time": "8h 6m 29s", "loss_scale": 1.0, "consumed_samples": 491520, "global_step/max_steps": "1920/6362"} +{"lm loss": 5.04577684, "grad_norm": 0.67577696, "learning_rate": 8.412e-05, "elapsed_time_per_iteration": 6.62700677, "memory(GiB)": 21.51, "elapsed_time": "3h 30m 23s", "remaining_time": "8h 6m 23s", "loss_scale": 1.0, "consumed_samples": 491776, "global_step/max_steps": "1921/6362"} +{"lm loss": 5.01132679, "grad_norm": 0.64083302, "learning_rate": 8.41e-05, "elapsed_time_per_iteration": 6.68518806, "memory(GiB)": 21.51, "elapsed_time": "3h 30m 30s", "remaining_time": "8h 6m 16s", "loss_scale": 1.0, "consumed_samples": 492032, "global_step/max_steps": "1922/6362"} +{"lm loss": 5.05027103, "grad_norm": 0.66116613, "learning_rate": 8.408e-05, "elapsed_time_per_iteration": 6.81098533, "memory(GiB)": 21.51, "elapsed_time": "3h 30m 37s", "remaining_time": "8h 6m 10s", "loss_scale": 1.0, "consumed_samples": 492288, "global_step/max_steps": "1923/6362"} +{"lm loss": 5.04069138, "grad_norm": 0.69348186, "learning_rate": 8.406e-05, "elapsed_time_per_iteration": 6.64851069, "memory(GiB)": 21.51, "elapsed_time": "3h 30m 43s", "remaining_time": "8h 6m 4s", "loss_scale": 1.0, "consumed_samples": 492544, "global_step/max_steps": "1924/6362"} +{"lm loss": 5.03088188, "grad_norm": 0.68428451, "learning_rate": 8.404e-05, "elapsed_time_per_iteration": 6.48012185, "memory(GiB)": 21.51, "elapsed_time": "3h 30m 50s", "remaining_time": "8h 5m 57s", "loss_scale": 1.0, "consumed_samples": 492800, "global_step/max_steps": "1925/6362"} +{"lm loss": 5.0146575, "grad_norm": 0.74426347, "learning_rate": 8.402e-05, "elapsed_time_per_iteration": 6.5934329, "memory(GiB)": 21.51, "elapsed_time": "3h 30m 56s", "remaining_time": "8h 5m 51s", "loss_scale": 1.0, "consumed_samples": 493056, "global_step/max_steps": "1926/6362"} +{"lm loss": 5.01045752, "grad_norm": 0.90885049, "learning_rate": 8.401e-05, "elapsed_time_per_iteration": 6.51539803, "memory(GiB)": 21.51, "elapsed_time": "3h 31m 3s", "remaining_time": "8h 5m 44s", "loss_scale": 1.0, "consumed_samples": 493312, "global_step/max_steps": "1927/6362"} +{"lm loss": 5.03586388, "grad_norm": 0.87745631, "learning_rate": 8.399e-05, "elapsed_time_per_iteration": 6.34434271, "memory(GiB)": 21.51, "elapsed_time": "3h 31m 9s", "remaining_time": "8h 5m 37s", "loss_scale": 1.0, "consumed_samples": 493568, "global_step/max_steps": "1928/6362"} +{"lm loss": 5.03946066, "grad_norm": 0.83251292, "learning_rate": 8.397e-05, "elapsed_time_per_iteration": 6.47927475, "memory(GiB)": 21.51, "elapsed_time": "3h 31m 16s", "remaining_time": "8h 5m 30s", "loss_scale": 1.0, "consumed_samples": 493824, "global_step/max_steps": "1929/6362"} +{"lm loss": 5.02575397, "grad_norm": 0.68526566, "learning_rate": 8.395e-05, "elapsed_time_per_iteration": 6.56318998, "memory(GiB)": 21.51, "elapsed_time": "3h 31m 22s", "remaining_time": "8h 5m 24s", "loss_scale": 1.0, "consumed_samples": 494080, "global_step/max_steps": "1930/6362"} +{"lm loss": 5.02727556, "grad_norm": 0.57920098, "learning_rate": 8.393e-05, "elapsed_time_per_iteration": 6.48463988, "memory(GiB)": 21.51, "elapsed_time": "3h 31m 29s", "remaining_time": "8h 5m 17s", "loss_scale": 1.0, "consumed_samples": 494336, "global_step/max_steps": "1931/6362"} +{"lm loss": 5.03178167, "grad_norm": 0.65814465, "learning_rate": 8.391e-05, "elapsed_time_per_iteration": 6.52930069, "memory(GiB)": 21.51, "elapsed_time": "3h 31m 35s", "remaining_time": "8h 5m 10s", "loss_scale": 1.0, "consumed_samples": 494592, "global_step/max_steps": "1932/6362"} +{"lm loss": 5.02219963, "grad_norm": 0.70827782, "learning_rate": 8.389e-05, "elapsed_time_per_iteration": 6.49101877, "memory(GiB)": 21.51, "elapsed_time": "3h 31m 42s", "remaining_time": "8h 5m 3s", "loss_scale": 1.0, "consumed_samples": 494848, "global_step/max_steps": "1933/6362"} +{"lm loss": 5.03698492, "grad_norm": 0.74704391, "learning_rate": 8.387e-05, "elapsed_time_per_iteration": 6.66520882, "memory(GiB)": 21.51, "elapsed_time": "3h 31m 48s", "remaining_time": "8h 4m 57s", "loss_scale": 1.0, "consumed_samples": 495104, "global_step/max_steps": "1934/6362"} +{"lm loss": 5.05065489, "grad_norm": 0.68680972, "learning_rate": 8.386e-05, "elapsed_time_per_iteration": 6.42550635, "memory(GiB)": 21.51, "elapsed_time": "3h 31m 55s", "remaining_time": "8h 4m 50s", "loss_scale": 1.0, "consumed_samples": 495360, "global_step/max_steps": "1935/6362"} +{"lm loss": 5.03201485, "grad_norm": 0.63562238, "learning_rate": 8.384e-05, "elapsed_time_per_iteration": 6.55051088, "memory(GiB)": 21.51, "elapsed_time": "3h 32m 1s", "remaining_time": "8h 4m 43s", "loss_scale": 1.0, "consumed_samples": 495616, "global_step/max_steps": "1936/6362"} +{"lm loss": 5.01679325, "grad_norm": 0.67359972, "learning_rate": 8.382e-05, "elapsed_time_per_iteration": 6.46328688, "memory(GiB)": 21.51, "elapsed_time": "3h 32m 8s", "remaining_time": "8h 4m 37s", "loss_scale": 1.0, "consumed_samples": 495872, "global_step/max_steps": "1937/6362"} +{"lm loss": 5.02397299, "grad_norm": 0.68423074, "learning_rate": 8.38e-05, "elapsed_time_per_iteration": 6.45725346, "memory(GiB)": 21.51, "elapsed_time": "3h 32m 14s", "remaining_time": "8h 4m 30s", "loss_scale": 1.0, "consumed_samples": 496128, "global_step/max_steps": "1938/6362"} +{"lm loss": 5.02611494, "grad_norm": 0.70495868, "learning_rate": 8.378e-05, "elapsed_time_per_iteration": 6.49880695, "memory(GiB)": 21.51, "elapsed_time": "3h 32m 21s", "remaining_time": "8h 4m 23s", "loss_scale": 1.0, "consumed_samples": 496384, "global_step/max_steps": "1939/6362"} +{"lm loss": 5.04427195, "grad_norm": 0.73971814, "learning_rate": 8.376e-05, "elapsed_time_per_iteration": 6.44197106, "memory(GiB)": 21.51, "elapsed_time": "3h 32m 27s", "remaining_time": "8h 4m 16s", "loss_scale": 1.0, "consumed_samples": 496640, "global_step/max_steps": "1940/6362"} +{"lm loss": 5.01971292, "grad_norm": 0.60400259, "learning_rate": 8.374e-05, "elapsed_time_per_iteration": 6.82754326, "memory(GiB)": 21.51, "elapsed_time": "3h 32m 34s", "remaining_time": "8h 4m 10s", "loss_scale": 1.0, "consumed_samples": 496896, "global_step/max_steps": "1941/6362"} +{"lm loss": 5.01148844, "grad_norm": 0.53428864, "learning_rate": 8.372e-05, "elapsed_time_per_iteration": 6.52322865, "memory(GiB)": 21.51, "elapsed_time": "3h 32m 40s", "remaining_time": "8h 4m 4s", "loss_scale": 1.0, "consumed_samples": 497152, "global_step/max_steps": "1942/6362"} +{"lm loss": 5.01262903, "grad_norm": 0.56112927, "learning_rate": 8.37e-05, "elapsed_time_per_iteration": 6.51236439, "memory(GiB)": 21.51, "elapsed_time": "3h 32m 47s", "remaining_time": "8h 3m 57s", "loss_scale": 1.0, "consumed_samples": 497408, "global_step/max_steps": "1943/6362"} +{"lm loss": 5.02677822, "grad_norm": 0.60371256, "learning_rate": 8.369e-05, "elapsed_time_per_iteration": 6.53353167, "memory(GiB)": 21.51, "elapsed_time": "3h 32m 54s", "remaining_time": "8h 3m 50s", "loss_scale": 1.0, "consumed_samples": 497664, "global_step/max_steps": "1944/6362"} +{"lm loss": 5.03155041, "grad_norm": 0.72038585, "learning_rate": 8.367e-05, "elapsed_time_per_iteration": 6.5050199, "memory(GiB)": 21.51, "elapsed_time": "3h 33m 0s", "remaining_time": "8h 3m 43s", "loss_scale": 1.0, "consumed_samples": 497920, "global_step/max_steps": "1945/6362"} +{"lm loss": 5.03013611, "grad_norm": 0.76958501, "learning_rate": 8.365e-05, "elapsed_time_per_iteration": 6.41679621, "memory(GiB)": 21.51, "elapsed_time": "3h 33m 6s", "remaining_time": "8h 3m 37s", "loss_scale": 1.0, "consumed_samples": 498176, "global_step/max_steps": "1946/6362"} +{"lm loss": 5.03872108, "grad_norm": 0.72365862, "learning_rate": 8.363e-05, "elapsed_time_per_iteration": 6.76039672, "memory(GiB)": 21.51, "elapsed_time": "3h 33m 13s", "remaining_time": "8h 3m 30s", "loss_scale": 1.0, "consumed_samples": 498432, "global_step/max_steps": "1947/6362"} +{"lm loss": 5.04185534, "grad_norm": 0.77008253, "learning_rate": 8.361e-05, "elapsed_time_per_iteration": 6.33743763, "memory(GiB)": 21.51, "elapsed_time": "3h 33m 20s", "remaining_time": "8h 3m 23s", "loss_scale": 1.0, "consumed_samples": 498688, "global_step/max_steps": "1948/6362"} +{"lm loss": 5.0294323, "grad_norm": 0.84808856, "learning_rate": 8.359e-05, "elapsed_time_per_iteration": 6.65931058, "memory(GiB)": 21.51, "elapsed_time": "3h 33m 26s", "remaining_time": "8h 3m 17s", "loss_scale": 1.0, "consumed_samples": 498944, "global_step/max_steps": "1949/6362"} +{"lm loss": 5.03454685, "grad_norm": 0.82823694, "learning_rate": 8.357e-05, "elapsed_time_per_iteration": 6.47165442, "memory(GiB)": 21.51, "elapsed_time": "3h 33m 33s", "remaining_time": "8h 3m 10s", "loss_scale": 1.0, "consumed_samples": 499200, "global_step/max_steps": "1950/6362"} +{"lm loss": 5.02378702, "grad_norm": 0.68137723, "learning_rate": 8.355e-05, "elapsed_time_per_iteration": 6.55378485, "memory(GiB)": 21.51, "elapsed_time": "3h 33m 39s", "remaining_time": "8h 3m 4s", "loss_scale": 1.0, "consumed_samples": 499456, "global_step/max_steps": "1951/6362"} +{"lm loss": 4.99439001, "grad_norm": 0.69798994, "learning_rate": 8.353e-05, "elapsed_time_per_iteration": 6.62055206, "memory(GiB)": 21.51, "elapsed_time": "3h 33m 46s", "remaining_time": "8h 2m 57s", "loss_scale": 1.0, "consumed_samples": 499712, "global_step/max_steps": "1952/6362"} +{"lm loss": 5.03958464, "grad_norm": 0.72472966, "learning_rate": 8.352e-05, "elapsed_time_per_iteration": 6.82740474, "memory(GiB)": 21.51, "elapsed_time": "3h 33m 53s", "remaining_time": "8h 2m 51s", "loss_scale": 1.0, "consumed_samples": 499968, "global_step/max_steps": "1953/6362"} +{"lm loss": 5.028687, "grad_norm": 0.75229996, "learning_rate": 8.35e-05, "elapsed_time_per_iteration": 6.62408566, "memory(GiB)": 21.51, "elapsed_time": "3h 33m 59s", "remaining_time": "8h 2m 45s", "loss_scale": 1.0, "consumed_samples": 500224, "global_step/max_steps": "1954/6362"} +{"lm loss": 5.02517796, "grad_norm": 0.7664392, "learning_rate": 8.348e-05, "elapsed_time_per_iteration": 6.97503138, "memory(GiB)": 21.51, "elapsed_time": "3h 34m 6s", "remaining_time": "8h 2m 39s", "loss_scale": 1.0, "consumed_samples": 500480, "global_step/max_steps": "1955/6362"} +{"lm loss": 5.00874949, "grad_norm": 0.65703356, "learning_rate": 8.346e-05, "elapsed_time_per_iteration": 6.60047245, "memory(GiB)": 21.51, "elapsed_time": "3h 34m 13s", "remaining_time": "8h 2m 32s", "loss_scale": 1.0, "consumed_samples": 500736, "global_step/max_steps": "1956/6362"} +{"lm loss": 5.05454111, "grad_norm": 0.64480996, "learning_rate": 8.344e-05, "elapsed_time_per_iteration": 7.06155396, "memory(GiB)": 21.51, "elapsed_time": "3h 34m 20s", "remaining_time": "8h 2m 27s", "loss_scale": 1.0, "consumed_samples": 500992, "global_step/max_steps": "1957/6362"} +{"lm loss": 5.03858328, "grad_norm": 0.59008861, "learning_rate": 8.342e-05, "elapsed_time_per_iteration": 6.39399695, "memory(GiB)": 21.51, "elapsed_time": "3h 34m 26s", "remaining_time": "8h 2m 20s", "loss_scale": 1.0, "consumed_samples": 501248, "global_step/max_steps": "1958/6362"} +{"lm loss": 5.02581882, "grad_norm": 0.64030397, "learning_rate": 8.34e-05, "elapsed_time_per_iteration": 6.6411643, "memory(GiB)": 21.51, "elapsed_time": "3h 34m 33s", "remaining_time": "8h 2m 14s", "loss_scale": 1.0, "consumed_samples": 501504, "global_step/max_steps": "1959/6362"} +{"lm loss": 5.03378534, "grad_norm": 0.64608335, "learning_rate": 8.338e-05, "elapsed_time_per_iteration": 6.74525523, "memory(GiB)": 21.51, "elapsed_time": "3h 34m 40s", "remaining_time": "8h 2m 7s", "loss_scale": 1.0, "consumed_samples": 501760, "global_step/max_steps": "1960/6362"} +{"lm loss": 5.00980377, "grad_norm": 0.63927633, "learning_rate": 8.336e-05, "elapsed_time_per_iteration": 6.56057143, "memory(GiB)": 21.51, "elapsed_time": "3h 34m 46s", "remaining_time": "8h 2m 1s", "loss_scale": 1.0, "consumed_samples": 502016, "global_step/max_steps": "1961/6362"} +{"lm loss": 5.0281477, "grad_norm": 0.7092576, "learning_rate": 8.335e-05, "elapsed_time_per_iteration": 6.51776147, "memory(GiB)": 21.51, "elapsed_time": "3h 34m 53s", "remaining_time": "8h 1m 54s", "loss_scale": 1.0, "consumed_samples": 502272, "global_step/max_steps": "1962/6362"} +{"lm loss": 5.0204854, "grad_norm": 0.67356926, "learning_rate": 8.333e-05, "elapsed_time_per_iteration": 6.76051331, "memory(GiB)": 21.51, "elapsed_time": "3h 35m 0s", "remaining_time": "8h 1m 48s", "loss_scale": 1.0, "consumed_samples": 502528, "global_step/max_steps": "1963/6362"} +{"lm loss": 5.01832628, "grad_norm": 0.709894, "learning_rate": 8.331e-05, "elapsed_time_per_iteration": 6.75741148, "memory(GiB)": 21.51, "elapsed_time": "3h 35m 6s", "remaining_time": "8h 1m 42s", "loss_scale": 1.0, "consumed_samples": 502784, "global_step/max_steps": "1964/6362"} +{"lm loss": 5.04481506, "grad_norm": 0.84919125, "learning_rate": 8.329e-05, "elapsed_time_per_iteration": 6.62546372, "memory(GiB)": 21.51, "elapsed_time": "3h 35m 13s", "remaining_time": "8h 1m 35s", "loss_scale": 1.0, "consumed_samples": 503040, "global_step/max_steps": "1965/6362"} +{"lm loss": 5.0185194, "grad_norm": 1.13967037, "learning_rate": 8.327e-05, "elapsed_time_per_iteration": 6.77908778, "memory(GiB)": 21.51, "elapsed_time": "3h 35m 20s", "remaining_time": "8h 1m 29s", "loss_scale": 1.0, "consumed_samples": 503296, "global_step/max_steps": "1966/6362"} +{"lm loss": 5.05682659, "grad_norm": 0.84456706, "learning_rate": 8.325e-05, "elapsed_time_per_iteration": 6.50336409, "memory(GiB)": 21.51, "elapsed_time": "3h 35m 26s", "remaining_time": "8h 1m 23s", "loss_scale": 1.0, "consumed_samples": 503552, "global_step/max_steps": "1967/6362"} +{"lm loss": 5.03001833, "grad_norm": 0.78270233, "learning_rate": 8.323e-05, "elapsed_time_per_iteration": 6.53802323, "memory(GiB)": 21.51, "elapsed_time": "3h 35m 33s", "remaining_time": "8h 1m 16s", "loss_scale": 1.0, "consumed_samples": 503808, "global_step/max_steps": "1968/6362"} +{"lm loss": 5.0243063, "grad_norm": 0.83277768, "learning_rate": 8.321e-05, "elapsed_time_per_iteration": 6.5850594, "memory(GiB)": 21.51, "elapsed_time": "3h 35m 39s", "remaining_time": "8h 1m 9s", "loss_scale": 1.0, "consumed_samples": 504064, "global_step/max_steps": "1969/6362"} +{"lm loss": 4.98593521, "grad_norm": 0.92911106, "learning_rate": 8.319e-05, "elapsed_time_per_iteration": 6.46228194, "memory(GiB)": 21.51, "elapsed_time": "3h 35m 46s", "remaining_time": "8h 1m 3s", "loss_scale": 1.0, "consumed_samples": 504320, "global_step/max_steps": "1970/6362"} +{"lm loss": 5.00994396, "grad_norm": 0.96096373, "learning_rate": 8.317e-05, "elapsed_time_per_iteration": 6.95009017, "memory(GiB)": 21.51, "elapsed_time": "3h 35m 53s", "remaining_time": "8h 0m 57s", "loss_scale": 1.0, "consumed_samples": 504576, "global_step/max_steps": "1971/6362"} +{"lm loss": 5.00179386, "grad_norm": 0.73305225, "learning_rate": 8.315e-05, "elapsed_time_per_iteration": 6.48420882, "memory(GiB)": 21.51, "elapsed_time": "3h 35m 59s", "remaining_time": "8h 0m 50s", "loss_scale": 1.0, "consumed_samples": 504832, "global_step/max_steps": "1972/6362"} +{"lm loss": 5.04543829, "grad_norm": 0.64854395, "learning_rate": 8.314e-05, "elapsed_time_per_iteration": 6.66589832, "memory(GiB)": 21.51, "elapsed_time": "3h 36m 6s", "remaining_time": "8h 0m 44s", "loss_scale": 1.0, "consumed_samples": 505088, "global_step/max_steps": "1973/6362"} +{"lm loss": 5.03476906, "grad_norm": 0.71991712, "learning_rate": 8.312e-05, "elapsed_time_per_iteration": 6.75637865, "memory(GiB)": 21.51, "elapsed_time": "3h 36m 13s", "remaining_time": "8h 0m 38s", "loss_scale": 1.0, "consumed_samples": 505344, "global_step/max_steps": "1974/6362"} +{"lm loss": 5.02618408, "grad_norm": 0.76185995, "learning_rate": 8.31e-05, "elapsed_time_per_iteration": 6.52871203, "memory(GiB)": 21.51, "elapsed_time": "3h 36m 19s", "remaining_time": "8h 0m 31s", "loss_scale": 1.0, "consumed_samples": 505600, "global_step/max_steps": "1975/6362"} +{"lm loss": 5.04248476, "grad_norm": 0.7255705, "learning_rate": 8.308e-05, "elapsed_time_per_iteration": 6.51348829, "memory(GiB)": 21.51, "elapsed_time": "3h 36m 26s", "remaining_time": "8h 0m 24s", "loss_scale": 1.0, "consumed_samples": 505856, "global_step/max_steps": "1976/6362"} +{"lm loss": 5.016397, "grad_norm": 0.69826221, "learning_rate": 8.306e-05, "elapsed_time_per_iteration": 6.6361661, "memory(GiB)": 21.51, "elapsed_time": "3h 36m 32s", "remaining_time": "8h 0m 18s", "loss_scale": 1.0, "consumed_samples": 506112, "global_step/max_steps": "1977/6362"} +{"lm loss": 5.02295923, "grad_norm": 0.6291666, "learning_rate": 8.304e-05, "elapsed_time_per_iteration": 6.53963113, "memory(GiB)": 21.51, "elapsed_time": "3h 36m 39s", "remaining_time": "8h 0m 11s", "loss_scale": 1.0, "consumed_samples": 506368, "global_step/max_steps": "1978/6362"} +{"lm loss": 5.02045918, "grad_norm": 0.5829888, "learning_rate": 8.302e-05, "elapsed_time_per_iteration": 6.68168116, "memory(GiB)": 21.51, "elapsed_time": "3h 36m 46s", "remaining_time": "8h 0m 5s", "loss_scale": 1.0, "consumed_samples": 506624, "global_step/max_steps": "1979/6362"} +{"lm loss": 5.00754786, "grad_norm": 0.66276026, "learning_rate": 8.3e-05, "elapsed_time_per_iteration": 6.61624599, "memory(GiB)": 21.51, "elapsed_time": "3h 36m 52s", "remaining_time": "7h 59m 58s", "loss_scale": 1.0, "consumed_samples": 506880, "global_step/max_steps": "1980/6362"} +{"lm loss": 5.01461458, "grad_norm": 0.64401346, "learning_rate": 8.298e-05, "elapsed_time_per_iteration": 6.70245767, "memory(GiB)": 21.51, "elapsed_time": "3h 36m 59s", "remaining_time": "7h 59m 52s", "loss_scale": 1.0, "consumed_samples": 507136, "global_step/max_steps": "1981/6362"} +{"lm loss": 5.01545095, "grad_norm": 0.64540851, "learning_rate": 8.296e-05, "elapsed_time_per_iteration": 6.60766172, "memory(GiB)": 21.51, "elapsed_time": "3h 37m 6s", "remaining_time": "7h 59m 46s", "loss_scale": 1.0, "consumed_samples": 507392, "global_step/max_steps": "1982/6362"} +{"lm loss": 5.00321293, "grad_norm": 0.56466025, "learning_rate": 8.294e-05, "elapsed_time_per_iteration": 6.44182682, "memory(GiB)": 21.51, "elapsed_time": "3h 37m 12s", "remaining_time": "7h 59m 39s", "loss_scale": 1.0, "consumed_samples": 507648, "global_step/max_steps": "1983/6362"} +{"lm loss": 5.0178566, "grad_norm": 0.66483617, "learning_rate": 8.292e-05, "elapsed_time_per_iteration": 6.70257878, "memory(GiB)": 21.51, "elapsed_time": "3h 37m 19s", "remaining_time": "7h 59m 32s", "loss_scale": 1.0, "consumed_samples": 507904, "global_step/max_steps": "1984/6362"} +{"lm loss": 5.02065945, "grad_norm": 0.62975562, "learning_rate": 8.291e-05, "elapsed_time_per_iteration": 6.32524061, "memory(GiB)": 21.51, "elapsed_time": "3h 37m 25s", "remaining_time": "7h 59m 25s", "loss_scale": 1.0, "consumed_samples": 508160, "global_step/max_steps": "1985/6362"} +{"lm loss": 5.05420542, "grad_norm": 0.66641343, "learning_rate": 8.289e-05, "elapsed_time_per_iteration": 6.45822072, "memory(GiB)": 21.51, "elapsed_time": "3h 37m 31s", "remaining_time": "7h 59m 18s", "loss_scale": 1.0, "consumed_samples": 508416, "global_step/max_steps": "1986/6362"} +{"lm loss": 5.03133774, "grad_norm": 0.66374773, "learning_rate": 8.287e-05, "elapsed_time_per_iteration": 6.52806926, "memory(GiB)": 21.51, "elapsed_time": "3h 37m 38s", "remaining_time": "7h 59m 12s", "loss_scale": 1.0, "consumed_samples": 508672, "global_step/max_steps": "1987/6362"} +{"lm loss": 5.03618526, "grad_norm": 0.59337157, "learning_rate": 8.285e-05, "elapsed_time_per_iteration": 6.4542768, "memory(GiB)": 21.51, "elapsed_time": "3h 37m 44s", "remaining_time": "7h 59m 5s", "loss_scale": 1.0, "consumed_samples": 508928, "global_step/max_steps": "1988/6362"} +{"lm loss": 5.01839209, "grad_norm": 0.63229299, "learning_rate": 8.283e-05, "elapsed_time_per_iteration": 6.52014804, "memory(GiB)": 21.51, "elapsed_time": "3h 37m 51s", "remaining_time": "7h 58m 58s", "loss_scale": 1.0, "consumed_samples": 509184, "global_step/max_steps": "1989/6362"} +{"lm loss": 5.05617952, "grad_norm": 0.80871409, "learning_rate": 8.281e-05, "elapsed_time_per_iteration": 6.68191767, "memory(GiB)": 21.51, "elapsed_time": "3h 37m 58s", "remaining_time": "7h 58m 52s", "loss_scale": 1.0, "consumed_samples": 509440, "global_step/max_steps": "1990/6362"} +{"lm loss": 5.0072217, "grad_norm": 0.76705015, "learning_rate": 8.279e-05, "elapsed_time_per_iteration": 7.39211655, "memory(GiB)": 21.51, "elapsed_time": "3h 38m 5s", "remaining_time": "7h 58m 47s", "loss_scale": 1.0, "consumed_samples": 509696, "global_step/max_steps": "1991/6362"} +{"lm loss": 5.01437235, "grad_norm": 0.60371512, "learning_rate": 8.277e-05, "elapsed_time_per_iteration": 6.60715532, "memory(GiB)": 21.51, "elapsed_time": "3h 38m 12s", "remaining_time": "7h 58m 41s", "loss_scale": 1.0, "consumed_samples": 509952, "global_step/max_steps": "1992/6362"} +{"lm loss": 5.02442646, "grad_norm": 0.7472074, "learning_rate": 8.275e-05, "elapsed_time_per_iteration": 6.34316063, "memory(GiB)": 21.51, "elapsed_time": "3h 38m 18s", "remaining_time": "7h 58m 34s", "loss_scale": 1.0, "consumed_samples": 510208, "global_step/max_steps": "1993/6362"} +{"lm loss": 5.04874992, "grad_norm": 0.75620824, "learning_rate": 8.273e-05, "elapsed_time_per_iteration": 6.37884712, "memory(GiB)": 21.51, "elapsed_time": "3h 38m 24s", "remaining_time": "7h 58m 27s", "loss_scale": 1.0, "consumed_samples": 510464, "global_step/max_steps": "1994/6362"} +{"lm loss": 5.04543161, "grad_norm": 0.75966865, "learning_rate": 8.271e-05, "elapsed_time_per_iteration": 6.36945581, "memory(GiB)": 21.51, "elapsed_time": "3h 38m 31s", "remaining_time": "7h 58m 20s", "loss_scale": 1.0, "consumed_samples": 510720, "global_step/max_steps": "1995/6362"} +{"lm loss": 4.99071836, "grad_norm": 0.88815713, "learning_rate": 8.269e-05, "elapsed_time_per_iteration": 6.26478028, "memory(GiB)": 21.51, "elapsed_time": "3h 38m 37s", "remaining_time": "7h 58m 12s", "loss_scale": 1.0, "consumed_samples": 510976, "global_step/max_steps": "1996/6362"} +{"lm loss": 5.0385437, "grad_norm": 1.03732157, "learning_rate": 8.267e-05, "elapsed_time_per_iteration": 6.39351058, "memory(GiB)": 21.51, "elapsed_time": "3h 38m 43s", "remaining_time": "7h 58m 5s", "loss_scale": 1.0, "consumed_samples": 511232, "global_step/max_steps": "1997/6362"} +{"lm loss": 5.01495838, "grad_norm": 0.9004842, "learning_rate": 8.266e-05, "elapsed_time_per_iteration": 6.5477562, "memory(GiB)": 21.51, "elapsed_time": "3h 38m 50s", "remaining_time": "7h 57m 59s", "loss_scale": 1.0, "consumed_samples": 511488, "global_step/max_steps": "1998/6362"} +{"lm loss": 5.03935814, "grad_norm": 0.84079361, "learning_rate": 8.264e-05, "elapsed_time_per_iteration": 6.77077293, "memory(GiB)": 21.51, "elapsed_time": "3h 38m 57s", "remaining_time": "7h 57m 53s", "loss_scale": 1.0, "consumed_samples": 511744, "global_step/max_steps": "1999/6362"} +{"lm loss": 5.01461935, "grad_norm": 0.83679014, "learning_rate": 8.262e-05, "elapsed_time_per_iteration": 6.81429529, "memory(GiB)": 21.51, "elapsed_time": "3h 39m 4s", "remaining_time": "7h 57m 47s", "loss_scale": 1.0, "consumed_samples": 512000, "global_step/max_steps": "2000/6362"} +{"lm loss": 5.02198076, "grad_norm": 0.9421621, "learning_rate": 8.26e-05, "elapsed_time_per_iteration": 6.54491091, "memory(GiB)": 21.51, "elapsed_time": "3h 39m 10s", "remaining_time": "7h 57m 40s", "loss_scale": 1.0, "consumed_samples": 512256, "global_step/max_steps": "2001/6362"} +{"lm loss": 5.05314493, "grad_norm": 0.88049245, "learning_rate": 8.258e-05, "elapsed_time_per_iteration": 6.54177904, "memory(GiB)": 21.51, "elapsed_time": "3h 39m 17s", "remaining_time": "7h 57m 33s", "loss_scale": 1.0, "consumed_samples": 512512, "global_step/max_steps": "2002/6362"} +{"lm loss": 5.02023745, "grad_norm": 0.73491591, "learning_rate": 8.256e-05, "elapsed_time_per_iteration": 6.57139087, "memory(GiB)": 21.51, "elapsed_time": "3h 39m 23s", "remaining_time": "7h 57m 27s", "loss_scale": 1.0, "consumed_samples": 512768, "global_step/max_steps": "2003/6362"} +{"lm loss": 5.03378391, "grad_norm": 0.77708405, "learning_rate": 8.254e-05, "elapsed_time_per_iteration": 6.73619986, "memory(GiB)": 21.51, "elapsed_time": "3h 39m 30s", "remaining_time": "7h 57m 21s", "loss_scale": 1.0, "consumed_samples": 513024, "global_step/max_steps": "2004/6362"} +{"lm loss": 5.01245499, "grad_norm": 0.71789724, "learning_rate": 8.252e-05, "elapsed_time_per_iteration": 6.4795115, "memory(GiB)": 21.51, "elapsed_time": "3h 39m 36s", "remaining_time": "7h 57m 14s", "loss_scale": 1.0, "consumed_samples": 513280, "global_step/max_steps": "2005/6362"} +{"lm loss": 5.00330734, "grad_norm": 0.67977285, "learning_rate": 8.25e-05, "elapsed_time_per_iteration": 6.61147809, "memory(GiB)": 21.51, "elapsed_time": "3h 39m 43s", "remaining_time": "7h 57m 7s", "loss_scale": 1.0, "consumed_samples": 513536, "global_step/max_steps": "2006/6362"} +{"lm loss": 5.01832771, "grad_norm": 0.71620667, "learning_rate": 8.248e-05, "elapsed_time_per_iteration": 6.72185755, "memory(GiB)": 21.51, "elapsed_time": "3h 39m 50s", "remaining_time": "7h 57m 1s", "loss_scale": 1.0, "consumed_samples": 513792, "global_step/max_steps": "2007/6362"} +{"lm loss": 5.01182318, "grad_norm": 0.67095745, "learning_rate": 8.246e-05, "elapsed_time_per_iteration": 6.57699776, "memory(GiB)": 21.51, "elapsed_time": "3h 39m 56s", "remaining_time": "7h 56m 54s", "loss_scale": 1.0, "consumed_samples": 514048, "global_step/max_steps": "2008/6362"} +{"lm loss": 5.01904631, "grad_norm": 0.61220467, "learning_rate": 8.244e-05, "elapsed_time_per_iteration": 6.62800789, "memory(GiB)": 21.51, "elapsed_time": "3h 40m 3s", "remaining_time": "7h 56m 48s", "loss_scale": 1.0, "consumed_samples": 514304, "global_step/max_steps": "2009/6362"} +{"lm loss": 5.01223326, "grad_norm": 0.64210868, "learning_rate": 8.242e-05, "elapsed_time_per_iteration": 6.63541222, "memory(GiB)": 21.51, "elapsed_time": "3h 40m 10s", "remaining_time": "7h 56m 42s", "loss_scale": 1.0, "consumed_samples": 514560, "global_step/max_steps": "2010/6362"} +{"lm loss": 5.003407, "grad_norm": 0.70985991, "learning_rate": 8.24e-05, "elapsed_time_per_iteration": 6.65141845, "memory(GiB)": 21.51, "elapsed_time": "3h 40m 16s", "remaining_time": "7h 56m 35s", "loss_scale": 1.0, "consumed_samples": 514816, "global_step/max_steps": "2011/6362"} +{"lm loss": 5.01282454, "grad_norm": 0.62554467, "learning_rate": 8.238e-05, "elapsed_time_per_iteration": 6.72439647, "memory(GiB)": 21.51, "elapsed_time": "3h 40m 23s", "remaining_time": "7h 56m 29s", "loss_scale": 1.0, "consumed_samples": 515072, "global_step/max_steps": "2012/6362"} +{"lm loss": 5.00176477, "grad_norm": 0.5875119, "learning_rate": 8.236e-05, "elapsed_time_per_iteration": 6.36771727, "memory(GiB)": 21.51, "elapsed_time": "3h 40m 29s", "remaining_time": "7h 56m 22s", "loss_scale": 1.0, "consumed_samples": 515328, "global_step/max_steps": "2013/6362"} +{"lm loss": 4.99395514, "grad_norm": 0.59891468, "learning_rate": 8.234e-05, "elapsed_time_per_iteration": 6.57229662, "memory(GiB)": 21.51, "elapsed_time": "3h 40m 36s", "remaining_time": "7h 56m 15s", "loss_scale": 1.0, "consumed_samples": 515584, "global_step/max_steps": "2014/6362"} +{"lm loss": 5.02072048, "grad_norm": 0.56513411, "learning_rate": 8.233e-05, "elapsed_time_per_iteration": 6.2708509, "memory(GiB)": 21.51, "elapsed_time": "3h 40m 42s", "remaining_time": "7h 56m 8s", "loss_scale": 1.0, "consumed_samples": 515840, "global_step/max_steps": "2015/6362"} +{"lm loss": 5.00411224, "grad_norm": 0.61087698, "learning_rate": 8.231e-05, "elapsed_time_per_iteration": 6.51566243, "memory(GiB)": 21.51, "elapsed_time": "3h 40m 49s", "remaining_time": "7h 56m 1s", "loss_scale": 1.0, "consumed_samples": 516096, "global_step/max_steps": "2016/6362"} +{"lm loss": 5.03161716, "grad_norm": 0.61748916, "learning_rate": 8.229e-05, "elapsed_time_per_iteration": 6.69239664, "memory(GiB)": 21.51, "elapsed_time": "3h 40m 55s", "remaining_time": "7h 55m 55s", "loss_scale": 1.0, "consumed_samples": 516352, "global_step/max_steps": "2017/6362"} +{"lm loss": 5.03676367, "grad_norm": 0.65885842, "learning_rate": 8.227e-05, "elapsed_time_per_iteration": 6.405586, "memory(GiB)": 21.51, "elapsed_time": "3h 41m 2s", "remaining_time": "7h 55m 48s", "loss_scale": 1.0, "consumed_samples": 516608, "global_step/max_steps": "2018/6362"} +{"lm loss": 4.99054623, "grad_norm": 0.73923981, "learning_rate": 8.225e-05, "elapsed_time_per_iteration": 6.38558888, "memory(GiB)": 21.51, "elapsed_time": "3h 41m 8s", "remaining_time": "7h 55m 41s", "loss_scale": 1.0, "consumed_samples": 516864, "global_step/max_steps": "2019/6362"} +{"lm loss": 5.03112793, "grad_norm": 0.91447395, "learning_rate": 8.223e-05, "elapsed_time_per_iteration": 6.73016191, "memory(GiB)": 21.51, "elapsed_time": "3h 41m 15s", "remaining_time": "7h 55m 35s", "loss_scale": 1.0, "consumed_samples": 517120, "global_step/max_steps": "2020/6362"} +{"lm loss": 5.01754522, "grad_norm": 0.83725727, "learning_rate": 8.221e-05, "elapsed_time_per_iteration": 6.43635345, "memory(GiB)": 21.51, "elapsed_time": "3h 41m 21s", "remaining_time": "7h 55m 28s", "loss_scale": 1.0, "consumed_samples": 517376, "global_step/max_steps": "2021/6362"} +{"lm loss": 5.02384853, "grad_norm": 0.60820651, "learning_rate": 8.219e-05, "elapsed_time_per_iteration": 6.70221663, "memory(GiB)": 21.51, "elapsed_time": "3h 41m 28s", "remaining_time": "7h 55m 22s", "loss_scale": 1.0, "consumed_samples": 517632, "global_step/max_steps": "2022/6362"} +{"lm loss": 5.01933336, "grad_norm": 0.60892493, "learning_rate": 8.217e-05, "elapsed_time_per_iteration": 6.31432772, "memory(GiB)": 21.51, "elapsed_time": "3h 41m 34s", "remaining_time": "7h 55m 15s", "loss_scale": 1.0, "consumed_samples": 517888, "global_step/max_steps": "2023/6362"} +{"lm loss": 5.00498199, "grad_norm": 0.68001044, "learning_rate": 8.215e-05, "elapsed_time_per_iteration": 6.48922825, "memory(GiB)": 21.51, "elapsed_time": "3h 41m 41s", "remaining_time": "7h 55m 8s", "loss_scale": 1.0, "consumed_samples": 518144, "global_step/max_steps": "2024/6362"} +{"lm loss": 5.01917744, "grad_norm": 0.7736941, "learning_rate": 8.213e-05, "elapsed_time_per_iteration": 6.54824305, "memory(GiB)": 21.51, "elapsed_time": "3h 41m 47s", "remaining_time": "7h 55m 1s", "loss_scale": 1.0, "consumed_samples": 518400, "global_step/max_steps": "2025/6362"} +{"lm loss": 5.00216389, "grad_norm": 0.85354012, "learning_rate": 8.211e-05, "elapsed_time_per_iteration": 6.8333807, "memory(GiB)": 21.51, "elapsed_time": "3h 41m 54s", "remaining_time": "7h 54m 55s", "loss_scale": 1.0, "consumed_samples": 518656, "global_step/max_steps": "2026/6362"} +{"lm loss": 5.02444029, "grad_norm": 0.87782514, "learning_rate": 8.209e-05, "elapsed_time_per_iteration": 6.4618969, "memory(GiB)": 21.51, "elapsed_time": "3h 42m 1s", "remaining_time": "7h 54m 48s", "loss_scale": 1.0, "consumed_samples": 518912, "global_step/max_steps": "2027/6362"} +{"lm loss": 5.03022766, "grad_norm": 0.86541724, "learning_rate": 8.207e-05, "elapsed_time_per_iteration": 6.4930563, "memory(GiB)": 21.51, "elapsed_time": "3h 42m 7s", "remaining_time": "7h 54m 42s", "loss_scale": 1.0, "consumed_samples": 519168, "global_step/max_steps": "2028/6362"} +{"lm loss": 5.00871372, "grad_norm": 0.75857288, "learning_rate": 8.205e-05, "elapsed_time_per_iteration": 6.45606852, "memory(GiB)": 21.51, "elapsed_time": "3h 42m 14s", "remaining_time": "7h 54m 35s", "loss_scale": 1.0, "consumed_samples": 519424, "global_step/max_steps": "2029/6362"} +{"lm loss": 5.00503445, "grad_norm": 0.70501679, "learning_rate": 8.203e-05, "elapsed_time_per_iteration": 6.50084114, "memory(GiB)": 21.51, "elapsed_time": "3h 42m 20s", "remaining_time": "7h 54m 28s", "loss_scale": 1.0, "consumed_samples": 519680, "global_step/max_steps": "2030/6362"} +{"lm loss": 5.00579166, "grad_norm": 0.58316624, "learning_rate": 8.201e-05, "elapsed_time_per_iteration": 6.83018184, "memory(GiB)": 21.51, "elapsed_time": "3h 42m 27s", "remaining_time": "7h 54m 22s", "loss_scale": 1.0, "consumed_samples": 519936, "global_step/max_steps": "2031/6362"} +{"lm loss": 5.02537584, "grad_norm": 0.7394256, "learning_rate": 8.199e-05, "elapsed_time_per_iteration": 6.5623157, "memory(GiB)": 21.51, "elapsed_time": "3h 42m 33s", "remaining_time": "7h 54m 16s", "loss_scale": 1.0, "consumed_samples": 520192, "global_step/max_steps": "2032/6362"} +{"lm loss": 5.03478527, "grad_norm": 0.81835091, "learning_rate": 8.197e-05, "elapsed_time_per_iteration": 6.47366714, "memory(GiB)": 21.51, "elapsed_time": "3h 42m 40s", "remaining_time": "7h 54m 9s", "loss_scale": 1.0, "consumed_samples": 520448, "global_step/max_steps": "2033/6362"} +{"lm loss": 4.99677896, "grad_norm": 0.76657665, "learning_rate": 8.195e-05, "elapsed_time_per_iteration": 6.51158285, "memory(GiB)": 21.51, "elapsed_time": "3h 42m 46s", "remaining_time": "7h 54m 2s", "loss_scale": 1.0, "consumed_samples": 520704, "global_step/max_steps": "2034/6362"} +{"lm loss": 5.03494263, "grad_norm": 0.73089516, "learning_rate": 8.193e-05, "elapsed_time_per_iteration": 6.48351526, "memory(GiB)": 21.51, "elapsed_time": "3h 42m 53s", "remaining_time": "7h 53m 55s", "loss_scale": 1.0, "consumed_samples": 520960, "global_step/max_steps": "2035/6362"} +{"lm loss": 5.02491093, "grad_norm": 0.66600609, "learning_rate": 8.191e-05, "elapsed_time_per_iteration": 6.65502715, "memory(GiB)": 21.51, "elapsed_time": "3h 43m 0s", "remaining_time": "7h 53m 49s", "loss_scale": 1.0, "consumed_samples": 521216, "global_step/max_steps": "2036/6362"} +{"lm loss": 5.04039526, "grad_norm": 0.67840809, "learning_rate": 8.19e-05, "elapsed_time_per_iteration": 6.53503609, "memory(GiB)": 21.51, "elapsed_time": "3h 43m 6s", "remaining_time": "7h 53m 42s", "loss_scale": 1.0, "consumed_samples": 521472, "global_step/max_steps": "2037/6362"} +{"lm loss": 5.01350164, "grad_norm": 0.76550514, "learning_rate": 8.188e-05, "elapsed_time_per_iteration": 6.2837255, "memory(GiB)": 21.51, "elapsed_time": "3h 43m 12s", "remaining_time": "7h 53m 35s", "loss_scale": 1.0, "consumed_samples": 521728, "global_step/max_steps": "2038/6362"} +{"lm loss": 5.01378441, "grad_norm": 0.63816345, "learning_rate": 8.186e-05, "elapsed_time_per_iteration": 6.35783958, "memory(GiB)": 21.51, "elapsed_time": "3h 43m 19s", "remaining_time": "7h 53m 28s", "loss_scale": 1.0, "consumed_samples": 521984, "global_step/max_steps": "2039/6362"} +{"lm loss": 5.01006365, "grad_norm": 0.72107339, "learning_rate": 8.184e-05, "elapsed_time_per_iteration": 6.75381351, "memory(GiB)": 21.51, "elapsed_time": "3h 43m 26s", "remaining_time": "7h 53m 22s", "loss_scale": 1.0, "consumed_samples": 522240, "global_step/max_steps": "2040/6362"} +{"lm loss": 5.02594042, "grad_norm": 0.65975988, "learning_rate": 8.182e-05, "elapsed_time_per_iteration": 6.53699565, "memory(GiB)": 21.51, "elapsed_time": "3h 43m 32s", "remaining_time": "7h 53m 15s", "loss_scale": 1.0, "consumed_samples": 522496, "global_step/max_steps": "2041/6362"} +{"lm loss": 5.03313971, "grad_norm": 0.59305936, "learning_rate": 8.18e-05, "elapsed_time_per_iteration": 6.38342452, "memory(GiB)": 21.51, "elapsed_time": "3h 43m 38s", "remaining_time": "7h 53m 8s", "loss_scale": 1.0, "consumed_samples": 522752, "global_step/max_steps": "2042/6362"} +{"lm loss": 5.02545738, "grad_norm": 0.5808183, "learning_rate": 8.178e-05, "elapsed_time_per_iteration": 6.5594933, "memory(GiB)": 21.51, "elapsed_time": "3h 43m 45s", "remaining_time": "7h 53m 2s", "loss_scale": 1.0, "consumed_samples": 523008, "global_step/max_steps": "2043/6362"} +{"lm loss": 4.99267912, "grad_norm": 0.63322932, "learning_rate": 8.176e-05, "elapsed_time_per_iteration": 6.34282875, "memory(GiB)": 21.51, "elapsed_time": "3h 43m 51s", "remaining_time": "7h 52m 55s", "loss_scale": 1.0, "consumed_samples": 523264, "global_step/max_steps": "2044/6362"} +{"lm loss": 5.02348328, "grad_norm": 0.71591288, "learning_rate": 8.174e-05, "elapsed_time_per_iteration": 6.40289235, "memory(GiB)": 21.51, "elapsed_time": "3h 43m 58s", "remaining_time": "7h 52m 48s", "loss_scale": 1.0, "consumed_samples": 523520, "global_step/max_steps": "2045/6362"} +{"lm loss": 5.05135393, "grad_norm": 0.72552073, "learning_rate": 8.172e-05, "elapsed_time_per_iteration": 6.59896779, "memory(GiB)": 21.51, "elapsed_time": "3h 44m 4s", "remaining_time": "7h 52m 41s", "loss_scale": 1.0, "consumed_samples": 523776, "global_step/max_steps": "2046/6362"} +{"lm loss": 5.00305891, "grad_norm": 0.81308073, "learning_rate": 8.17e-05, "elapsed_time_per_iteration": 6.42746019, "memory(GiB)": 21.51, "elapsed_time": "3h 44m 11s", "remaining_time": "7h 52m 34s", "loss_scale": 1.0, "consumed_samples": 524032, "global_step/max_steps": "2047/6362"} +{"lm loss": 5.02271271, "grad_norm": 0.88520122, "learning_rate": 8.168e-05, "elapsed_time_per_iteration": 6.34703255, "memory(GiB)": 21.51, "elapsed_time": "3h 44m 17s", "remaining_time": "7h 52m 27s", "loss_scale": 1.0, "consumed_samples": 524288, "global_step/max_steps": "2048/6362"} +{"lm loss": 5.00837088, "grad_norm": 0.8467896, "learning_rate": 8.166e-05, "elapsed_time_per_iteration": 6.68204498, "memory(GiB)": 21.51, "elapsed_time": "3h 44m 24s", "remaining_time": "7h 52m 21s", "loss_scale": 1.0, "consumed_samples": 524544, "global_step/max_steps": "2049/6362"} +{"lm loss": 5.03130531, "grad_norm": 0.70840967, "learning_rate": 8.164e-05, "elapsed_time_per_iteration": 6.49915624, "memory(GiB)": 21.51, "elapsed_time": "3h 44m 30s", "remaining_time": "7h 52m 14s", "loss_scale": 1.0, "consumed_samples": 524800, "global_step/max_steps": "2050/6362"} +{"lm loss": 5.03052616, "grad_norm": 0.71936542, "learning_rate": 8.162e-05, "elapsed_time_per_iteration": 6.71615648, "memory(GiB)": 21.51, "elapsed_time": "3h 44m 37s", "remaining_time": "7h 52m 8s", "loss_scale": 1.0, "consumed_samples": 525056, "global_step/max_steps": "2051/6362"} +{"lm loss": 5.00727892, "grad_norm": 0.77300757, "learning_rate": 8.16e-05, "elapsed_time_per_iteration": 6.73026586, "memory(GiB)": 21.51, "elapsed_time": "3h 44m 44s", "remaining_time": "7h 52m 2s", "loss_scale": 1.0, "consumed_samples": 525312, "global_step/max_steps": "2052/6362"} +{"lm loss": 5.00458479, "grad_norm": 0.63066608, "learning_rate": 8.158e-05, "elapsed_time_per_iteration": 6.79432201, "memory(GiB)": 21.51, "elapsed_time": "3h 44m 51s", "remaining_time": "7h 51m 56s", "loss_scale": 1.0, "consumed_samples": 525568, "global_step/max_steps": "2053/6362"} +{"lm loss": 5.036695, "grad_norm": 0.71041942, "learning_rate": 8.156e-05, "elapsed_time_per_iteration": 6.47824764, "memory(GiB)": 21.51, "elapsed_time": "3h 44m 57s", "remaining_time": "7h 51m 49s", "loss_scale": 1.0, "consumed_samples": 525824, "global_step/max_steps": "2054/6362"} +{"lm loss": 5.03615713, "grad_norm": 0.63732773, "learning_rate": 8.154e-05, "elapsed_time_per_iteration": 6.40373635, "memory(GiB)": 21.51, "elapsed_time": "3h 45m 3s", "remaining_time": "7h 51m 42s", "loss_scale": 1.0, "consumed_samples": 526080, "global_step/max_steps": "2055/6362"} +{"lm loss": 5.00474262, "grad_norm": 0.58333135, "learning_rate": 8.152e-05, "elapsed_time_per_iteration": 6.65832758, "memory(GiB)": 21.51, "elapsed_time": "3h 45m 10s", "remaining_time": "7h 51m 36s", "loss_scale": 1.0, "consumed_samples": 526336, "global_step/max_steps": "2056/6362"} +{"lm loss": 5.01452541, "grad_norm": 0.70015913, "learning_rate": 8.15e-05, "elapsed_time_per_iteration": 6.43821883, "memory(GiB)": 21.51, "elapsed_time": "3h 45m 17s", "remaining_time": "7h 51m 29s", "loss_scale": 1.0, "consumed_samples": 526592, "global_step/max_steps": "2057/6362"} +{"lm loss": 5.01853561, "grad_norm": 0.81547296, "learning_rate": 8.148e-05, "elapsed_time_per_iteration": 6.60882735, "memory(GiB)": 21.51, "elapsed_time": "3h 45m 23s", "remaining_time": "7h 51m 22s", "loss_scale": 1.0, "consumed_samples": 526848, "global_step/max_steps": "2058/6362"} +{"lm loss": 5.00768375, "grad_norm": 0.78737253, "learning_rate": 8.146e-05, "elapsed_time_per_iteration": 6.42007399, "memory(GiB)": 21.51, "elapsed_time": "3h 45m 30s", "remaining_time": "7h 51m 15s", "loss_scale": 1.0, "consumed_samples": 527104, "global_step/max_steps": "2059/6362"} +{"lm loss": 5.00978518, "grad_norm": 0.69876063, "learning_rate": 8.144e-05, "elapsed_time_per_iteration": 6.40327716, "memory(GiB)": 21.51, "elapsed_time": "3h 45m 36s", "remaining_time": "7h 51m 8s", "loss_scale": 1.0, "consumed_samples": 527360, "global_step/max_steps": "2060/6362"} +{"lm loss": 5.00898743, "grad_norm": 0.74211144, "learning_rate": 8.142e-05, "elapsed_time_per_iteration": 6.64093709, "memory(GiB)": 21.51, "elapsed_time": "3h 45m 43s", "remaining_time": "7h 51m 2s", "loss_scale": 1.0, "consumed_samples": 527616, "global_step/max_steps": "2061/6362"} +{"lm loss": 5.01845455, "grad_norm": 0.79473996, "learning_rate": 8.14e-05, "elapsed_time_per_iteration": 6.56442499, "memory(GiB)": 21.51, "elapsed_time": "3h 45m 49s", "remaining_time": "7h 50m 55s", "loss_scale": 1.0, "consumed_samples": 527872, "global_step/max_steps": "2062/6362"} +{"lm loss": 4.99607229, "grad_norm": 0.7278195, "learning_rate": 8.138e-05, "elapsed_time_per_iteration": 6.66225886, "memory(GiB)": 21.51, "elapsed_time": "3h 45m 56s", "remaining_time": "7h 50m 49s", "loss_scale": 1.0, "consumed_samples": 528128, "global_step/max_steps": "2063/6362"} +{"lm loss": 4.99609661, "grad_norm": 0.86840558, "learning_rate": 8.136e-05, "elapsed_time_per_iteration": 6.47201228, "memory(GiB)": 21.51, "elapsed_time": "3h 46m 2s", "remaining_time": "7h 50m 42s", "loss_scale": 1.0, "consumed_samples": 528384, "global_step/max_steps": "2064/6362"} +{"lm loss": 5.04714251, "grad_norm": 0.89360416, "learning_rate": 8.134e-05, "elapsed_time_per_iteration": 6.44743633, "memory(GiB)": 21.51, "elapsed_time": "3h 46m 9s", "remaining_time": "7h 50m 35s", "loss_scale": 1.0, "consumed_samples": 528640, "global_step/max_steps": "2065/6362"} +{"lm loss": 5.02760363, "grad_norm": 0.74332231, "learning_rate": 8.132e-05, "elapsed_time_per_iteration": 6.64427567, "memory(GiB)": 21.51, "elapsed_time": "3h 46m 15s", "remaining_time": "7h 50m 29s", "loss_scale": 1.0, "consumed_samples": 528896, "global_step/max_steps": "2066/6362"} +{"lm loss": 5.01469707, "grad_norm": 0.67566395, "learning_rate": 8.13e-05, "elapsed_time_per_iteration": 6.8003304, "memory(GiB)": 21.51, "elapsed_time": "3h 46m 22s", "remaining_time": "7h 50m 23s", "loss_scale": 1.0, "consumed_samples": 529152, "global_step/max_steps": "2067/6362"} +{"lm loss": 5.00017786, "grad_norm": 0.52927154, "learning_rate": 8.128e-05, "elapsed_time_per_iteration": 6.50911307, "memory(GiB)": 21.51, "elapsed_time": "3h 46m 29s", "remaining_time": "7h 50m 16s", "loss_scale": 1.0, "consumed_samples": 529408, "global_step/max_steps": "2068/6362"} +{"lm loss": 5.0237689, "grad_norm": 0.54191583, "learning_rate": 8.126e-05, "elapsed_time_per_iteration": 6.42093658, "memory(GiB)": 21.51, "elapsed_time": "3h 46m 35s", "remaining_time": "7h 50m 9s", "loss_scale": 1.0, "consumed_samples": 529664, "global_step/max_steps": "2069/6362"} +{"lm loss": 5.01655436, "grad_norm": 0.62615716, "learning_rate": 8.124e-05, "elapsed_time_per_iteration": 6.44782686, "memory(GiB)": 21.51, "elapsed_time": "3h 46m 42s", "remaining_time": "7h 50m 2s", "loss_scale": 1.0, "consumed_samples": 529920, "global_step/max_steps": "2070/6362"} +{"lm loss": 5.01918888, "grad_norm": 0.69353759, "learning_rate": 8.122e-05, "elapsed_time_per_iteration": 6.28063869, "memory(GiB)": 21.51, "elapsed_time": "3h 46m 48s", "remaining_time": "7h 49m 55s", "loss_scale": 1.0, "consumed_samples": 530176, "global_step/max_steps": "2071/6362"} +{"lm loss": 5.01732445, "grad_norm": 0.69711226, "learning_rate": 8.12e-05, "elapsed_time_per_iteration": 6.14011168, "memory(GiB)": 21.51, "elapsed_time": "3h 46m 54s", "remaining_time": "7h 49m 48s", "loss_scale": 1.0, "consumed_samples": 530432, "global_step/max_steps": "2072/6362"} +{"lm loss": 5.01291037, "grad_norm": 0.65776819, "learning_rate": 8.118e-05, "elapsed_time_per_iteration": 6.50737906, "memory(GiB)": 21.51, "elapsed_time": "3h 47m 1s", "remaining_time": "7h 49m 41s", "loss_scale": 1.0, "consumed_samples": 530688, "global_step/max_steps": "2073/6362"} +{"lm loss": 5.01674938, "grad_norm": 0.6038776, "learning_rate": 8.116e-05, "elapsed_time_per_iteration": 6.54794073, "memory(GiB)": 21.51, "elapsed_time": "3h 47m 7s", "remaining_time": "7h 49m 35s", "loss_scale": 1.0, "consumed_samples": 530944, "global_step/max_steps": "2074/6362"} +{"lm loss": 4.99916697, "grad_norm": 0.58397269, "learning_rate": 8.114e-05, "elapsed_time_per_iteration": 6.58144474, "memory(GiB)": 21.51, "elapsed_time": "3h 47m 14s", "remaining_time": "7h 49m 28s", "loss_scale": 1.0, "consumed_samples": 531200, "global_step/max_steps": "2075/6362"} +{"lm loss": 5.02586126, "grad_norm": 0.669644, "learning_rate": 8.112e-05, "elapsed_time_per_iteration": 6.51383615, "memory(GiB)": 21.51, "elapsed_time": "3h 47m 20s", "remaining_time": "7h 49m 21s", "loss_scale": 1.0, "consumed_samples": 531456, "global_step/max_steps": "2076/6362"} +{"lm loss": 5.03100967, "grad_norm": 0.63791156, "learning_rate": 8.11e-05, "elapsed_time_per_iteration": 6.5627749, "memory(GiB)": 21.51, "elapsed_time": "3h 47m 27s", "remaining_time": "7h 49m 15s", "loss_scale": 1.0, "consumed_samples": 531712, "global_step/max_steps": "2077/6362"} +{"lm loss": 5.00214767, "grad_norm": 0.60804611, "learning_rate": 8.108e-05, "elapsed_time_per_iteration": 6.4037559, "memory(GiB)": 21.51, "elapsed_time": "3h 47m 33s", "remaining_time": "7h 49m 8s", "loss_scale": 1.0, "consumed_samples": 531968, "global_step/max_steps": "2078/6362"} +{"lm loss": 5.02020407, "grad_norm": 0.7496922, "learning_rate": 8.106e-05, "elapsed_time_per_iteration": 6.52356935, "memory(GiB)": 21.51, "elapsed_time": "3h 47m 40s", "remaining_time": "7h 49m 1s", "loss_scale": 1.0, "consumed_samples": 532224, "global_step/max_steps": "2079/6362"} +{"lm loss": 5.02303934, "grad_norm": 0.72098899, "learning_rate": 8.104e-05, "elapsed_time_per_iteration": 6.62866759, "memory(GiB)": 21.51, "elapsed_time": "3h 47m 46s", "remaining_time": "7h 48m 55s", "loss_scale": 1.0, "consumed_samples": 532480, "global_step/max_steps": "2080/6362"} +{"lm loss": 5.02240562, "grad_norm": 0.68449444, "learning_rate": 8.102e-05, "elapsed_time_per_iteration": 6.59241557, "memory(GiB)": 21.51, "elapsed_time": "3h 47m 53s", "remaining_time": "7h 48m 48s", "loss_scale": 1.0, "consumed_samples": 532736, "global_step/max_steps": "2081/6362"} +{"lm loss": 5.02681398, "grad_norm": 0.56497836, "learning_rate": 8.1e-05, "elapsed_time_per_iteration": 6.39436316, "memory(GiB)": 21.51, "elapsed_time": "3h 47m 59s", "remaining_time": "7h 48m 41s", "loss_scale": 1.0, "consumed_samples": 532992, "global_step/max_steps": "2082/6362"} +{"lm loss": 5.03411293, "grad_norm": 0.65973771, "learning_rate": 8.098e-05, "elapsed_time_per_iteration": 6.64329863, "memory(GiB)": 21.51, "elapsed_time": "3h 48m 6s", "remaining_time": "7h 48m 35s", "loss_scale": 1.0, "consumed_samples": 533248, "global_step/max_steps": "2083/6362"} +{"lm loss": 5.02456951, "grad_norm": 0.74438387, "learning_rate": 8.096e-05, "elapsed_time_per_iteration": 6.43943787, "memory(GiB)": 21.51, "elapsed_time": "3h 48m 12s", "remaining_time": "7h 48m 28s", "loss_scale": 1.0, "consumed_samples": 533504, "global_step/max_steps": "2084/6362"} +{"lm loss": 5.00925303, "grad_norm": 0.80994588, "learning_rate": 8.094e-05, "elapsed_time_per_iteration": 6.62006807, "memory(GiB)": 21.51, "elapsed_time": "3h 48m 19s", "remaining_time": "7h 48m 21s", "loss_scale": 1.0, "consumed_samples": 533760, "global_step/max_steps": "2085/6362"} +{"lm loss": 5.00608683, "grad_norm": 0.75390255, "learning_rate": 8.092e-05, "elapsed_time_per_iteration": 6.5942142, "memory(GiB)": 21.51, "elapsed_time": "3h 48m 26s", "remaining_time": "7h 48m 15s", "loss_scale": 1.0, "consumed_samples": 534016, "global_step/max_steps": "2086/6362"} +{"lm loss": 5.00337172, "grad_norm": 0.64363718, "learning_rate": 8.09e-05, "elapsed_time_per_iteration": 6.58082008, "memory(GiB)": 21.51, "elapsed_time": "3h 48m 32s", "remaining_time": "7h 48m 8s", "loss_scale": 1.0, "consumed_samples": 534272, "global_step/max_steps": "2087/6362"} +{"lm loss": 5.02074957, "grad_norm": 0.56694454, "learning_rate": 8.088e-05, "elapsed_time_per_iteration": 6.68681931, "memory(GiB)": 21.51, "elapsed_time": "3h 48m 39s", "remaining_time": "7h 48m 2s", "loss_scale": 1.0, "consumed_samples": 534528, "global_step/max_steps": "2088/6362"} +{"lm loss": 5.02652884, "grad_norm": 0.64686865, "learning_rate": 8.086e-05, "elapsed_time_per_iteration": 6.57600045, "memory(GiB)": 21.51, "elapsed_time": "3h 48m 45s", "remaining_time": "7h 47m 56s", "loss_scale": 1.0, "consumed_samples": 534784, "global_step/max_steps": "2089/6362"} +{"lm loss": 5.03239679, "grad_norm": 0.63103354, "learning_rate": 8.084e-05, "elapsed_time_per_iteration": 6.41545033, "memory(GiB)": 21.51, "elapsed_time": "3h 48m 52s", "remaining_time": "7h 47m 49s", "loss_scale": 1.0, "consumed_samples": 535040, "global_step/max_steps": "2090/6362"} +{"lm loss": 5.0100255, "grad_norm": 0.58740526, "learning_rate": 8.082e-05, "elapsed_time_per_iteration": 6.52026439, "memory(GiB)": 21.51, "elapsed_time": "3h 48m 58s", "remaining_time": "7h 47m 42s", "loss_scale": 1.0, "consumed_samples": 535296, "global_step/max_steps": "2091/6362"} +{"lm loss": 5.00638628, "grad_norm": 0.65637362, "learning_rate": 8.08e-05, "elapsed_time_per_iteration": 6.55718136, "memory(GiB)": 21.51, "elapsed_time": "3h 49m 5s", "remaining_time": "7h 47m 35s", "loss_scale": 1.0, "consumed_samples": 535552, "global_step/max_steps": "2092/6362"} +{"lm loss": 5.02164888, "grad_norm": 0.73247313, "learning_rate": 8.078e-05, "elapsed_time_per_iteration": 6.63960123, "memory(GiB)": 21.51, "elapsed_time": "3h 49m 12s", "remaining_time": "7h 47m 29s", "loss_scale": 1.0, "consumed_samples": 535808, "global_step/max_steps": "2093/6362"} +{"lm loss": 4.99744892, "grad_norm": 0.87379497, "learning_rate": 8.076e-05, "elapsed_time_per_iteration": 6.50192189, "memory(GiB)": 21.51, "elapsed_time": "3h 49m 18s", "remaining_time": "7h 47m 22s", "loss_scale": 1.0, "consumed_samples": 536064, "global_step/max_steps": "2094/6362"} +{"lm loss": 5.01250935, "grad_norm": 1.05952835, "learning_rate": 8.074e-05, "elapsed_time_per_iteration": 6.51455092, "memory(GiB)": 21.51, "elapsed_time": "3h 49m 25s", "remaining_time": "7h 47m 16s", "loss_scale": 1.0, "consumed_samples": 536320, "global_step/max_steps": "2095/6362"} +{"lm loss": 5.01504755, "grad_norm": 0.97534937, "learning_rate": 8.072e-05, "elapsed_time_per_iteration": 6.83860588, "memory(GiB)": 21.51, "elapsed_time": "3h 49m 31s", "remaining_time": "7h 47m 10s", "loss_scale": 1.0, "consumed_samples": 536576, "global_step/max_steps": "2096/6362"} +{"lm loss": 5.00955629, "grad_norm": 0.79165506, "learning_rate": 8.07e-05, "elapsed_time_per_iteration": 6.47546482, "memory(GiB)": 21.51, "elapsed_time": "3h 49m 38s", "remaining_time": "7h 47m 3s", "loss_scale": 1.0, "consumed_samples": 536832, "global_step/max_steps": "2097/6362"} +{"lm loss": 5.01517248, "grad_norm": 0.6013394, "learning_rate": 8.068e-05, "elapsed_time_per_iteration": 6.47777462, "memory(GiB)": 21.51, "elapsed_time": "3h 49m 44s", "remaining_time": "7h 46m 56s", "loss_scale": 1.0, "consumed_samples": 537088, "global_step/max_steps": "2098/6362"} +{"lm loss": 5.00203753, "grad_norm": 0.78613579, "learning_rate": 8.066e-05, "elapsed_time_per_iteration": 6.65433335, "memory(GiB)": 21.51, "elapsed_time": "3h 49m 51s", "remaining_time": "7h 46m 50s", "loss_scale": 1.0, "consumed_samples": 537344, "global_step/max_steps": "2099/6362"} +{"lm loss": 5.02467155, "grad_norm": 0.89970666, "learning_rate": 8.064e-05, "elapsed_time_per_iteration": 6.64612269, "memory(GiB)": 21.51, "elapsed_time": "3h 49m 58s", "remaining_time": "7h 46m 43s", "loss_scale": 1.0, "consumed_samples": 537600, "global_step/max_steps": "2100/6362"} +{"lm loss": 5.00272322, "grad_norm": 0.69957918, "learning_rate": 8.062e-05, "elapsed_time_per_iteration": 6.73739004, "memory(GiB)": 21.51, "elapsed_time": "3h 50m 4s", "remaining_time": "7h 46m 37s", "loss_scale": 1.0, "consumed_samples": 537856, "global_step/max_steps": "2101/6362"} +{"lm loss": 4.99049044, "grad_norm": 0.60631174, "learning_rate": 8.06e-05, "elapsed_time_per_iteration": 6.71093273, "memory(GiB)": 21.51, "elapsed_time": "3h 50m 11s", "remaining_time": "7h 46m 31s", "loss_scale": 1.0, "consumed_samples": 538112, "global_step/max_steps": "2102/6362"} +{"lm loss": 5.02817917, "grad_norm": 0.77399069, "learning_rate": 8.058e-05, "elapsed_time_per_iteration": 6.62253857, "memory(GiB)": 21.51, "elapsed_time": "3h 50m 18s", "remaining_time": "7h 46m 24s", "loss_scale": 1.0, "consumed_samples": 538368, "global_step/max_steps": "2103/6362"} +{"lm loss": 5.00493288, "grad_norm": 0.83058602, "learning_rate": 8.056e-05, "elapsed_time_per_iteration": 6.58770728, "memory(GiB)": 21.51, "elapsed_time": "3h 50m 24s", "remaining_time": "7h 46m 18s", "loss_scale": 1.0, "consumed_samples": 538624, "global_step/max_steps": "2104/6362"} +{"lm loss": 5.00696516, "grad_norm": 0.72872597, "learning_rate": 8.054e-05, "elapsed_time_per_iteration": 6.74298215, "memory(GiB)": 21.51, "elapsed_time": "3h 50m 31s", "remaining_time": "7h 46m 11s", "loss_scale": 1.0, "consumed_samples": 538880, "global_step/max_steps": "2105/6362"} +{"lm loss": 5.02727509, "grad_norm": 0.73197055, "learning_rate": 8.052e-05, "elapsed_time_per_iteration": 6.72146726, "memory(GiB)": 21.51, "elapsed_time": "3h 50m 38s", "remaining_time": "7h 46m 5s", "loss_scale": 1.0, "consumed_samples": 539136, "global_step/max_steps": "2106/6362"} +{"lm loss": 5.01160669, "grad_norm": 0.66122663, "learning_rate": 8.05e-05, "elapsed_time_per_iteration": 6.55297494, "memory(GiB)": 21.51, "elapsed_time": "3h 50m 44s", "remaining_time": "7h 45m 59s", "loss_scale": 1.0, "consumed_samples": 539392, "global_step/max_steps": "2107/6362"} +{"lm loss": 4.99445772, "grad_norm": 0.65536916, "learning_rate": 8.048e-05, "elapsed_time_per_iteration": 6.54138422, "memory(GiB)": 21.51, "elapsed_time": "3h 50m 51s", "remaining_time": "7h 45m 52s", "loss_scale": 1.0, "consumed_samples": 539648, "global_step/max_steps": "2108/6362"} +{"lm loss": 5.02978945, "grad_norm": 0.62871909, "learning_rate": 8.046e-05, "elapsed_time_per_iteration": 6.583601, "memory(GiB)": 21.51, "elapsed_time": "3h 50m 57s", "remaining_time": "7h 45m 45s", "loss_scale": 1.0, "consumed_samples": 539904, "global_step/max_steps": "2109/6362"} +{"lm loss": 5.01308537, "grad_norm": 0.54059565, "learning_rate": 8.044e-05, "elapsed_time_per_iteration": 6.81823564, "memory(GiB)": 21.51, "elapsed_time": "3h 51m 4s", "remaining_time": "7h 45m 39s", "loss_scale": 1.0, "consumed_samples": 540160, "global_step/max_steps": "2110/6362"} +{"lm loss": 5.02499342, "grad_norm": 0.60241556, "learning_rate": 8.042e-05, "elapsed_time_per_iteration": 6.70924711, "memory(GiB)": 21.51, "elapsed_time": "3h 51m 11s", "remaining_time": "7h 45m 33s", "loss_scale": 1.0, "consumed_samples": 540416, "global_step/max_steps": "2111/6362"} +{"lm loss": 5.0275979, "grad_norm": 0.60388201, "learning_rate": 8.04e-05, "elapsed_time_per_iteration": 6.80384636, "memory(GiB)": 21.51, "elapsed_time": "3h 51m 18s", "remaining_time": "7h 45m 27s", "loss_scale": 1.0, "consumed_samples": 540672, "global_step/max_steps": "2112/6362"} +{"lm loss": 5.03306246, "grad_norm": 0.59995306, "learning_rate": 8.038e-05, "elapsed_time_per_iteration": 6.79771948, "memory(GiB)": 21.51, "elapsed_time": "3h 51m 25s", "remaining_time": "7h 45m 21s", "loss_scale": 1.0, "consumed_samples": 540928, "global_step/max_steps": "2113/6362"} +{"lm loss": 5.00077057, "grad_norm": 0.54647809, "learning_rate": 8.036e-05, "elapsed_time_per_iteration": 6.68981409, "memory(GiB)": 21.51, "elapsed_time": "3h 51m 31s", "remaining_time": "7h 45m 14s", "loss_scale": 1.0, "consumed_samples": 541184, "global_step/max_steps": "2114/6362"} +{"lm loss": 5.02181053, "grad_norm": 0.55631727, "learning_rate": 8.034e-05, "elapsed_time_per_iteration": 6.68039417, "memory(GiB)": 21.51, "elapsed_time": "3h 51m 38s", "remaining_time": "7h 45m 8s", "loss_scale": 1.0, "consumed_samples": 541440, "global_step/max_steps": "2115/6362"} +{"lm loss": 4.99216032, "grad_norm": 0.5478723, "learning_rate": 8.032e-05, "elapsed_time_per_iteration": 6.60202599, "memory(GiB)": 21.51, "elapsed_time": "3h 51m 45s", "remaining_time": "7h 45m 2s", "loss_scale": 1.0, "consumed_samples": 541696, "global_step/max_steps": "2116/6362"} +{"lm loss": 5.02415657, "grad_norm": 0.56153977, "learning_rate": 8.03e-05, "elapsed_time_per_iteration": 6.4342742, "memory(GiB)": 21.51, "elapsed_time": "3h 51m 51s", "remaining_time": "7h 44m 55s", "loss_scale": 1.0, "consumed_samples": 541952, "global_step/max_steps": "2117/6362"} +{"lm loss": 5.03630257, "grad_norm": 0.60365635, "learning_rate": 8.028e-05, "elapsed_time_per_iteration": 6.53115463, "memory(GiB)": 21.51, "elapsed_time": "3h 51m 58s", "remaining_time": "7h 44m 48s", "loss_scale": 1.0, "consumed_samples": 542208, "global_step/max_steps": "2118/6362"} +{"lm loss": 5.0124507, "grad_norm": 0.65517759, "learning_rate": 8.026e-05, "elapsed_time_per_iteration": 6.4939723, "memory(GiB)": 21.51, "elapsed_time": "3h 52m 4s", "remaining_time": "7h 44m 41s", "loss_scale": 1.0, "consumed_samples": 542464, "global_step/max_steps": "2119/6362"} +{"lm loss": 5.01508665, "grad_norm": 0.72730649, "learning_rate": 8.024e-05, "elapsed_time_per_iteration": 6.84218764, "memory(GiB)": 21.51, "elapsed_time": "3h 52m 11s", "remaining_time": "7h 44m 35s", "loss_scale": 1.0, "consumed_samples": 542720, "global_step/max_steps": "2120/6362"} +{"lm loss": 5.01345682, "grad_norm": 0.78752369, "learning_rate": 8.022e-05, "elapsed_time_per_iteration": 6.37747598, "memory(GiB)": 21.51, "elapsed_time": "3h 52m 17s", "remaining_time": "7h 44m 28s", "loss_scale": 1.0, "consumed_samples": 542976, "global_step/max_steps": "2121/6362"} +{"lm loss": 5.02231598, "grad_norm": 0.68109238, "learning_rate": 8.02e-05, "elapsed_time_per_iteration": 6.59293747, "memory(GiB)": 21.51, "elapsed_time": "3h 52m 24s", "remaining_time": "7h 44m 22s", "loss_scale": 1.0, "consumed_samples": 543232, "global_step/max_steps": "2122/6362"} +{"lm loss": 5.00786209, "grad_norm": 0.54542559, "learning_rate": 8.018e-05, "elapsed_time_per_iteration": 6.7501049, "memory(GiB)": 21.51, "elapsed_time": "3h 52m 31s", "remaining_time": "7h 44m 16s", "loss_scale": 1.0, "consumed_samples": 543488, "global_step/max_steps": "2123/6362"} +{"lm loss": 4.98927259, "grad_norm": 0.60589087, "learning_rate": 8.016e-05, "elapsed_time_per_iteration": 6.87544227, "memory(GiB)": 21.51, "elapsed_time": "3h 52m 37s", "remaining_time": "7h 44m 10s", "loss_scale": 1.0, "consumed_samples": 543744, "global_step/max_steps": "2124/6362"} +{"lm loss": 4.99356937, "grad_norm": 0.72348601, "learning_rate": 8.014e-05, "elapsed_time_per_iteration": 6.62458801, "memory(GiB)": 21.51, "elapsed_time": "3h 52m 44s", "remaining_time": "7h 44m 3s", "loss_scale": 1.0, "consumed_samples": 544000, "global_step/max_steps": "2125/6362"} +{"lm loss": 4.98563623, "grad_norm": 0.76132649, "learning_rate": 8.011e-05, "elapsed_time_per_iteration": 6.53125906, "memory(GiB)": 21.51, "elapsed_time": "3h 52m 51s", "remaining_time": "7h 43m 57s", "loss_scale": 1.0, "consumed_samples": 544256, "global_step/max_steps": "2126/6362"} +{"lm loss": 5.01219463, "grad_norm": 0.79695344, "learning_rate": 8.009e-05, "elapsed_time_per_iteration": 6.6650176, "memory(GiB)": 21.51, "elapsed_time": "3h 52m 57s", "remaining_time": "7h 43m 50s", "loss_scale": 1.0, "consumed_samples": 544512, "global_step/max_steps": "2127/6362"} +{"lm loss": 5.02372885, "grad_norm": 0.84246814, "learning_rate": 8.007e-05, "elapsed_time_per_iteration": 6.52519369, "memory(GiB)": 21.51, "elapsed_time": "3h 53m 4s", "remaining_time": "7h 43m 44s", "loss_scale": 1.0, "consumed_samples": 544768, "global_step/max_steps": "2128/6362"} +{"lm loss": 5.02468777, "grad_norm": 0.90496159, "learning_rate": 8.005e-05, "elapsed_time_per_iteration": 6.58464575, "memory(GiB)": 21.51, "elapsed_time": "3h 53m 10s", "remaining_time": "7h 43m 37s", "loss_scale": 1.0, "consumed_samples": 545024, "global_step/max_steps": "2129/6362"} +{"lm loss": 5.00620556, "grad_norm": 0.7876659, "learning_rate": 8.003e-05, "elapsed_time_per_iteration": 6.78318071, "memory(GiB)": 21.51, "elapsed_time": "3h 53m 17s", "remaining_time": "7h 43m 31s", "loss_scale": 1.0, "consumed_samples": 545280, "global_step/max_steps": "2130/6362"} +{"lm loss": 5.03333282, "grad_norm": 0.69196111, "learning_rate": 8.001e-05, "elapsed_time_per_iteration": 6.75867248, "memory(GiB)": 21.51, "elapsed_time": "3h 53m 24s", "remaining_time": "7h 43m 25s", "loss_scale": 1.0, "consumed_samples": 545536, "global_step/max_steps": "2131/6362"} +{"lm loss": 5.00197792, "grad_norm": 0.77296847, "learning_rate": 7.999e-05, "elapsed_time_per_iteration": 6.90634894, "memory(GiB)": 21.51, "elapsed_time": "3h 53m 31s", "remaining_time": "7h 43m 19s", "loss_scale": 1.0, "consumed_samples": 545792, "global_step/max_steps": "2132/6362"} +{"lm loss": 5.02244759, "grad_norm": 0.79861414, "learning_rate": 7.997e-05, "elapsed_time_per_iteration": 6.57436872, "memory(GiB)": 21.51, "elapsed_time": "3h 53m 37s", "remaining_time": "7h 43m 12s", "loss_scale": 1.0, "consumed_samples": 546048, "global_step/max_steps": "2133/6362"} +{"lm loss": 5.01963663, "grad_norm": 0.5924933, "learning_rate": 7.995e-05, "elapsed_time_per_iteration": 6.55893803, "memory(GiB)": 21.51, "elapsed_time": "3h 53m 44s", "remaining_time": "7h 43m 6s", "loss_scale": 1.0, "consumed_samples": 546304, "global_step/max_steps": "2134/6362"} +{"lm loss": 5.00865555, "grad_norm": 0.53051543, "learning_rate": 7.993e-05, "elapsed_time_per_iteration": 6.67192602, "memory(GiB)": 21.51, "elapsed_time": "3h 53m 51s", "remaining_time": "7h 42m 59s", "loss_scale": 1.0, "consumed_samples": 546560, "global_step/max_steps": "2135/6362"} +{"lm loss": 5.02355671, "grad_norm": 0.62458533, "learning_rate": 7.991e-05, "elapsed_time_per_iteration": 6.71605253, "memory(GiB)": 21.51, "elapsed_time": "3h 53m 57s", "remaining_time": "7h 42m 53s", "loss_scale": 1.0, "consumed_samples": 546816, "global_step/max_steps": "2136/6362"} +{"lm loss": 5.00426054, "grad_norm": 0.78470594, "learning_rate": 7.989e-05, "elapsed_time_per_iteration": 6.87072539, "memory(GiB)": 21.51, "elapsed_time": "3h 54m 4s", "remaining_time": "7h 42m 47s", "loss_scale": 1.0, "consumed_samples": 547072, "global_step/max_steps": "2137/6362"} +{"lm loss": 5.02957964, "grad_norm": 0.83385563, "learning_rate": 7.987e-05, "elapsed_time_per_iteration": 6.79501247, "memory(GiB)": 21.51, "elapsed_time": "3h 54m 11s", "remaining_time": "7h 42m 41s", "loss_scale": 1.0, "consumed_samples": 547328, "global_step/max_steps": "2138/6362"} +{"lm loss": 5.02191925, "grad_norm": 0.79514647, "learning_rate": 7.985e-05, "elapsed_time_per_iteration": 6.47227907, "memory(GiB)": 21.51, "elapsed_time": "3h 54m 17s", "remaining_time": "7h 42m 34s", "loss_scale": 1.0, "consumed_samples": 547584, "global_step/max_steps": "2139/6362"} +{"lm loss": 5.0290575, "grad_norm": 0.77151525, "learning_rate": 7.983e-05, "elapsed_time_per_iteration": 6.44976234, "memory(GiB)": 21.51, "elapsed_time": "3h 54m 24s", "remaining_time": "7h 42m 27s", "loss_scale": 1.0, "consumed_samples": 547840, "global_step/max_steps": "2140/6362"} +{"lm loss": 5.01278067, "grad_norm": 0.74216694, "learning_rate": 7.981e-05, "elapsed_time_per_iteration": 6.65524316, "memory(GiB)": 21.51, "elapsed_time": "3h 54m 31s", "remaining_time": "7h 42m 21s", "loss_scale": 1.0, "consumed_samples": 548096, "global_step/max_steps": "2141/6362"} +{"lm loss": 5.01337337, "grad_norm": 0.67802298, "learning_rate": 7.979e-05, "elapsed_time_per_iteration": 6.56139898, "memory(GiB)": 21.51, "elapsed_time": "3h 54m 37s", "remaining_time": "7h 42m 14s", "loss_scale": 1.0, "consumed_samples": 548352, "global_step/max_steps": "2142/6362"} +{"lm loss": 5.01225567, "grad_norm": 0.64585024, "learning_rate": 7.977e-05, "elapsed_time_per_iteration": 6.68942118, "memory(GiB)": 21.51, "elapsed_time": "3h 54m 44s", "remaining_time": "7h 42m 8s", "loss_scale": 1.0, "consumed_samples": 548608, "global_step/max_steps": "2143/6362"} +{"lm loss": 5.00954723, "grad_norm": 0.81025571, "learning_rate": 7.975e-05, "elapsed_time_per_iteration": 6.64818835, "memory(GiB)": 21.51, "elapsed_time": "3h 54m 50s", "remaining_time": "7h 42m 1s", "loss_scale": 1.0, "consumed_samples": 548864, "global_step/max_steps": "2144/6362"} +{"lm loss": 5.00505304, "grad_norm": 0.81134105, "learning_rate": 7.973e-05, "elapsed_time_per_iteration": 6.62234783, "memory(GiB)": 21.51, "elapsed_time": "3h 54m 57s", "remaining_time": "7h 41m 55s", "loss_scale": 1.0, "consumed_samples": 549120, "global_step/max_steps": "2145/6362"} +{"lm loss": 4.98826075, "grad_norm": 0.66208535, "learning_rate": 7.971e-05, "elapsed_time_per_iteration": 6.75410032, "memory(GiB)": 21.51, "elapsed_time": "3h 55m 4s", "remaining_time": "7h 41m 49s", "loss_scale": 1.0, "consumed_samples": 549376, "global_step/max_steps": "2146/6362"} +{"lm loss": 4.99556923, "grad_norm": 0.61061215, "learning_rate": 7.969e-05, "elapsed_time_per_iteration": 6.74755907, "memory(GiB)": 21.51, "elapsed_time": "3h 55m 11s", "remaining_time": "7h 41m 43s", "loss_scale": 1.0, "consumed_samples": 549632, "global_step/max_steps": "2147/6362"} +{"lm loss": 5.01056767, "grad_norm": 0.5700286, "learning_rate": 7.967e-05, "elapsed_time_per_iteration": 6.33194518, "memory(GiB)": 21.51, "elapsed_time": "3h 55m 17s", "remaining_time": "7h 41m 35s", "loss_scale": 1.0, "consumed_samples": 549888, "global_step/max_steps": "2148/6362"} +{"lm loss": 5.02157927, "grad_norm": 0.53447258, "learning_rate": 7.964e-05, "elapsed_time_per_iteration": 6.49754858, "memory(GiB)": 21.51, "elapsed_time": "3h 55m 23s", "remaining_time": "7h 41m 29s", "loss_scale": 1.0, "consumed_samples": 550144, "global_step/max_steps": "2149/6362"} +{"lm loss": 4.98909521, "grad_norm": 0.55378795, "learning_rate": 7.962e-05, "elapsed_time_per_iteration": 6.61748624, "memory(GiB)": 21.51, "elapsed_time": "3h 55m 30s", "remaining_time": "7h 41m 22s", "loss_scale": 1.0, "consumed_samples": 550400, "global_step/max_steps": "2150/6362"} +{"lm loss": 5.02176619, "grad_norm": 0.60957801, "learning_rate": 7.96e-05, "elapsed_time_per_iteration": 6.33182359, "memory(GiB)": 21.51, "elapsed_time": "3h 55m 36s", "remaining_time": "7h 41m 15s", "loss_scale": 1.0, "consumed_samples": 550656, "global_step/max_steps": "2151/6362"} +{"lm loss": 5.02918863, "grad_norm": 0.6003167, "learning_rate": 7.958e-05, "elapsed_time_per_iteration": 6.51113272, "memory(GiB)": 21.51, "elapsed_time": "3h 55m 43s", "remaining_time": "7h 41m 9s", "loss_scale": 1.0, "consumed_samples": 550912, "global_step/max_steps": "2152/6362"} +{"lm loss": 5.02051735, "grad_norm": 0.59337831, "learning_rate": 7.956e-05, "elapsed_time_per_iteration": 6.50980043, "memory(GiB)": 21.51, "elapsed_time": "3h 55m 49s", "remaining_time": "7h 41m 2s", "loss_scale": 1.0, "consumed_samples": 551168, "global_step/max_steps": "2153/6362"} +{"lm loss": 5.00270224, "grad_norm": 0.62762213, "learning_rate": 7.954e-05, "elapsed_time_per_iteration": 6.40193605, "memory(GiB)": 21.51, "elapsed_time": "3h 55m 56s", "remaining_time": "7h 40m 55s", "loss_scale": 1.0, "consumed_samples": 551424, "global_step/max_steps": "2154/6362"} +{"lm loss": 4.99281549, "grad_norm": 0.69543058, "learning_rate": 7.952e-05, "elapsed_time_per_iteration": 6.64179349, "memory(GiB)": 21.51, "elapsed_time": "3h 56m 2s", "remaining_time": "7h 40m 48s", "loss_scale": 1.0, "consumed_samples": 551680, "global_step/max_steps": "2155/6362"} +{"lm loss": 5.02203417, "grad_norm": 0.59580272, "learning_rate": 7.95e-05, "elapsed_time_per_iteration": 6.41773701, "memory(GiB)": 21.51, "elapsed_time": "3h 56m 9s", "remaining_time": "7h 40m 42s", "loss_scale": 1.0, "consumed_samples": 551936, "global_step/max_steps": "2156/6362"} +{"lm loss": 4.99998808, "grad_norm": 0.53014296, "learning_rate": 7.948e-05, "elapsed_time_per_iteration": 6.52550817, "memory(GiB)": 21.51, "elapsed_time": "3h 56m 15s", "remaining_time": "7h 40m 35s", "loss_scale": 1.0, "consumed_samples": 552192, "global_step/max_steps": "2157/6362"} +{"lm loss": 5.00105476, "grad_norm": 0.55074996, "learning_rate": 7.946e-05, "elapsed_time_per_iteration": 6.41024852, "memory(GiB)": 21.51, "elapsed_time": "3h 56m 22s", "remaining_time": "7h 40m 28s", "loss_scale": 1.0, "consumed_samples": 552448, "global_step/max_steps": "2158/6362"} +{"lm loss": 4.98673487, "grad_norm": 0.54370344, "learning_rate": 7.944e-05, "elapsed_time_per_iteration": 6.40698767, "memory(GiB)": 21.51, "elapsed_time": "3h 56m 28s", "remaining_time": "7h 40m 21s", "loss_scale": 1.0, "consumed_samples": 552704, "global_step/max_steps": "2159/6362"} +{"lm loss": 5.01548767, "grad_norm": 0.56937915, "learning_rate": 7.942e-05, "elapsed_time_per_iteration": 6.51631999, "memory(GiB)": 21.51, "elapsed_time": "3h 56m 35s", "remaining_time": "7h 40m 14s", "loss_scale": 1.0, "consumed_samples": 552960, "global_step/max_steps": "2160/6362"} +{"lm loss": 5.00317812, "grad_norm": 0.62434137, "learning_rate": 7.94e-05, "elapsed_time_per_iteration": 6.69709396, "memory(GiB)": 21.51, "elapsed_time": "3h 56m 41s", "remaining_time": "7h 40m 8s", "loss_scale": 1.0, "consumed_samples": 553216, "global_step/max_steps": "2161/6362"} +{"lm loss": 4.97549009, "grad_norm": 0.6374585, "learning_rate": 7.938e-05, "elapsed_time_per_iteration": 6.38348365, "memory(GiB)": 21.51, "elapsed_time": "3h 56m 48s", "remaining_time": "7h 40m 1s", "loss_scale": 1.0, "consumed_samples": 553472, "global_step/max_steps": "2162/6362"} +{"lm loss": 4.99185133, "grad_norm": 0.53902048, "learning_rate": 7.936e-05, "elapsed_time_per_iteration": 6.4351697, "memory(GiB)": 21.51, "elapsed_time": "3h 56m 54s", "remaining_time": "7h 39m 54s", "loss_scale": 1.0, "consumed_samples": 553728, "global_step/max_steps": "2163/6362"} +{"lm loss": 5.01003218, "grad_norm": 0.53685915, "learning_rate": 7.934e-05, "elapsed_time_per_iteration": 6.94215679, "memory(GiB)": 21.51, "elapsed_time": "3h 57m 1s", "remaining_time": "7h 39m 49s", "loss_scale": 1.0, "consumed_samples": 553984, "global_step/max_steps": "2164/6362"} +{"lm loss": 5.0015769, "grad_norm": 0.59667271, "learning_rate": 7.932e-05, "elapsed_time_per_iteration": 6.60938478, "memory(GiB)": 21.51, "elapsed_time": "3h 57m 8s", "remaining_time": "7h 39m 42s", "loss_scale": 1.0, "consumed_samples": 554240, "global_step/max_steps": "2165/6362"} +{"lm loss": 5.01250982, "grad_norm": 0.66688251, "learning_rate": 7.929e-05, "elapsed_time_per_iteration": 6.49230361, "memory(GiB)": 21.51, "elapsed_time": "3h 57m 14s", "remaining_time": "7h 39m 35s", "loss_scale": 1.0, "consumed_samples": 554496, "global_step/max_steps": "2166/6362"} +{"lm loss": 4.99133921, "grad_norm": 0.80256724, "learning_rate": 7.927e-05, "elapsed_time_per_iteration": 6.77305055, "memory(GiB)": 21.51, "elapsed_time": "3h 57m 21s", "remaining_time": "7h 39m 29s", "loss_scale": 1.0, "consumed_samples": 554752, "global_step/max_steps": "2167/6362"} +{"lm loss": 5.01305437, "grad_norm": 0.96435422, "learning_rate": 7.925e-05, "elapsed_time_per_iteration": 6.38316989, "memory(GiB)": 21.51, "elapsed_time": "3h 57m 27s", "remaining_time": "7h 39m 22s", "loss_scale": 1.0, "consumed_samples": 555008, "global_step/max_steps": "2168/6362"} +{"lm loss": 4.99439144, "grad_norm": 1.03108597, "learning_rate": 7.923e-05, "elapsed_time_per_iteration": 6.53942657, "memory(GiB)": 21.51, "elapsed_time": "3h 57m 34s", "remaining_time": "7h 39m 16s", "loss_scale": 1.0, "consumed_samples": 555264, "global_step/max_steps": "2169/6362"} +{"lm loss": 5.01439762, "grad_norm": 0.84523529, "learning_rate": 7.921e-05, "elapsed_time_per_iteration": 6.34157467, "memory(GiB)": 21.51, "elapsed_time": "3h 57m 40s", "remaining_time": "7h 39m 9s", "loss_scale": 1.0, "consumed_samples": 555520, "global_step/max_steps": "2170/6362"} +{"lm loss": 4.98386288, "grad_norm": 0.84370637, "learning_rate": 7.919e-05, "elapsed_time_per_iteration": 6.61831474, "memory(GiB)": 21.51, "elapsed_time": "3h 57m 47s", "remaining_time": "7h 39m 2s", "loss_scale": 1.0, "consumed_samples": 555776, "global_step/max_steps": "2171/6362"} +{"lm loss": 5.01647139, "grad_norm": 0.90859181, "learning_rate": 7.917e-05, "elapsed_time_per_iteration": 6.4787159, "memory(GiB)": 21.51, "elapsed_time": "3h 57m 53s", "remaining_time": "7h 38m 55s", "loss_scale": 1.0, "consumed_samples": 556032, "global_step/max_steps": "2172/6362"} +{"lm loss": 5.01709652, "grad_norm": 1.01613212, "learning_rate": 7.915e-05, "elapsed_time_per_iteration": 6.55682421, "memory(GiB)": 21.51, "elapsed_time": "3h 58m 0s", "remaining_time": "7h 38m 49s", "loss_scale": 1.0, "consumed_samples": 556288, "global_step/max_steps": "2173/6362"} +{"lm loss": 5.0227623, "grad_norm": 0.89189404, "learning_rate": 7.913e-05, "elapsed_time_per_iteration": 6.71695471, "memory(GiB)": 21.51, "elapsed_time": "3h 58m 7s", "remaining_time": "7h 38m 42s", "loss_scale": 1.0, "consumed_samples": 556544, "global_step/max_steps": "2174/6362"} +{"lm loss": 4.99324989, "grad_norm": 0.74246895, "learning_rate": 7.911e-05, "elapsed_time_per_iteration": 6.65795922, "memory(GiB)": 21.51, "elapsed_time": "3h 58m 13s", "remaining_time": "7h 38m 36s", "loss_scale": 1.0, "consumed_samples": 556800, "global_step/max_steps": "2175/6362"} +{"lm loss": 4.99411154, "grad_norm": 0.77215666, "learning_rate": 7.909e-05, "elapsed_time_per_iteration": 6.37712669, "memory(GiB)": 21.51, "elapsed_time": "3h 58m 20s", "remaining_time": "7h 38m 29s", "loss_scale": 1.0, "consumed_samples": 557056, "global_step/max_steps": "2176/6362"} +{"lm loss": 4.96966314, "grad_norm": 0.73121864, "learning_rate": 7.907e-05, "elapsed_time_per_iteration": 6.34732509, "memory(GiB)": 21.51, "elapsed_time": "3h 58m 26s", "remaining_time": "7h 38m 22s", "loss_scale": 1.0, "consumed_samples": 557312, "global_step/max_steps": "2177/6362"} +{"lm loss": 4.99012804, "grad_norm": 0.64499933, "learning_rate": 7.905e-05, "elapsed_time_per_iteration": 6.587322, "memory(GiB)": 21.51, "elapsed_time": "3h 58m 33s", "remaining_time": "7h 38m 16s", "loss_scale": 1.0, "consumed_samples": 557568, "global_step/max_steps": "2178/6362"} +{"lm loss": 5.00562, "grad_norm": 0.59402126, "learning_rate": 7.903e-05, "elapsed_time_per_iteration": 6.45392323, "memory(GiB)": 21.51, "elapsed_time": "3h 58m 39s", "remaining_time": "7h 38m 9s", "loss_scale": 1.0, "consumed_samples": 557824, "global_step/max_steps": "2179/6362"} +{"lm loss": 5.00054598, "grad_norm": 0.59424275, "learning_rate": 7.9e-05, "elapsed_time_per_iteration": 6.51486802, "memory(GiB)": 21.51, "elapsed_time": "3h 58m 46s", "remaining_time": "7h 38m 2s", "loss_scale": 1.0, "consumed_samples": 558080, "global_step/max_steps": "2180/6362"} +{"lm loss": 4.99754906, "grad_norm": 0.5820694, "learning_rate": 7.898e-05, "elapsed_time_per_iteration": 6.70509934, "memory(GiB)": 21.51, "elapsed_time": "3h 58m 52s", "remaining_time": "7h 37m 56s", "loss_scale": 1.0, "consumed_samples": 558336, "global_step/max_steps": "2181/6362"} +{"lm loss": 5.00906467, "grad_norm": 0.63943064, "learning_rate": 7.896e-05, "elapsed_time_per_iteration": 6.47242212, "memory(GiB)": 21.51, "elapsed_time": "3h 58m 59s", "remaining_time": "7h 37m 49s", "loss_scale": 1.0, "consumed_samples": 558592, "global_step/max_steps": "2182/6362"} +{"lm loss": 5.03166628, "grad_norm": 0.58491206, "learning_rate": 7.894e-05, "elapsed_time_per_iteration": 6.5604198, "memory(GiB)": 21.51, "elapsed_time": "3h 59m 5s", "remaining_time": "7h 37m 42s", "loss_scale": 1.0, "consumed_samples": 558848, "global_step/max_steps": "2183/6362"} +{"lm loss": 5.00647163, "grad_norm": 0.57279789, "learning_rate": 7.892e-05, "elapsed_time_per_iteration": 6.57467604, "memory(GiB)": 21.51, "elapsed_time": "3h 59m 12s", "remaining_time": "7h 37m 36s", "loss_scale": 1.0, "consumed_samples": 559104, "global_step/max_steps": "2184/6362"} +{"lm loss": 5.01322269, "grad_norm": 0.59786379, "learning_rate": 7.89e-05, "elapsed_time_per_iteration": 6.40229034, "memory(GiB)": 21.51, "elapsed_time": "3h 59m 18s", "remaining_time": "7h 37m 29s", "loss_scale": 1.0, "consumed_samples": 559360, "global_step/max_steps": "2185/6362"} +{"lm loss": 5.01924086, "grad_norm": 0.60721469, "learning_rate": 7.888e-05, "elapsed_time_per_iteration": 6.62036204, "memory(GiB)": 21.51, "elapsed_time": "3h 59m 25s", "remaining_time": "7h 37m 22s", "loss_scale": 1.0, "consumed_samples": 559616, "global_step/max_steps": "2186/6362"} +{"lm loss": 4.99560022, "grad_norm": 0.60912865, "learning_rate": 7.886e-05, "elapsed_time_per_iteration": 6.45159793, "memory(GiB)": 21.51, "elapsed_time": "3h 59m 31s", "remaining_time": "7h 37m 16s", "loss_scale": 1.0, "consumed_samples": 559872, "global_step/max_steps": "2187/6362"} +{"lm loss": 5.00811958, "grad_norm": 0.61640203, "learning_rate": 7.884e-05, "elapsed_time_per_iteration": 6.59172177, "memory(GiB)": 21.51, "elapsed_time": "3h 59m 38s", "remaining_time": "7h 37m 9s", "loss_scale": 1.0, "consumed_samples": 560128, "global_step/max_steps": "2188/6362"} +{"lm loss": 4.9789753, "grad_norm": 0.58776242, "learning_rate": 7.882e-05, "elapsed_time_per_iteration": 6.58727479, "memory(GiB)": 21.51, "elapsed_time": "3h 59m 45s", "remaining_time": "7h 37m 3s", "loss_scale": 1.0, "consumed_samples": 560384, "global_step/max_steps": "2189/6362"} +{"lm loss": 5.03157282, "grad_norm": 0.6010282, "learning_rate": 7.88e-05, "elapsed_time_per_iteration": 6.67402625, "memory(GiB)": 21.51, "elapsed_time": "3h 59m 51s", "remaining_time": "7h 36m 56s", "loss_scale": 1.0, "consumed_samples": 560640, "global_step/max_steps": "2190/6362"} +{"lm loss": 4.99350262, "grad_norm": 0.68955415, "learning_rate": 7.878e-05, "elapsed_time_per_iteration": 6.41922903, "memory(GiB)": 21.51, "elapsed_time": "3h 59m 58s", "remaining_time": "7h 36m 49s", "loss_scale": 1.0, "consumed_samples": 560896, "global_step/max_steps": "2191/6362"} +{"lm loss": 5.01812983, "grad_norm": 0.73671991, "learning_rate": 7.875e-05, "elapsed_time_per_iteration": 6.51575065, "memory(GiB)": 21.51, "elapsed_time": "4h 0m 4s", "remaining_time": "7h 36m 43s", "loss_scale": 1.0, "consumed_samples": 561152, "global_step/max_steps": "2192/6362"} +{"lm loss": 4.98915911, "grad_norm": 0.76051539, "learning_rate": 7.873e-05, "elapsed_time_per_iteration": 6.89974046, "memory(GiB)": 21.51, "elapsed_time": "4h 0m 11s", "remaining_time": "7h 36m 37s", "loss_scale": 1.0, "consumed_samples": 561408, "global_step/max_steps": "2193/6362"} +{"lm loss": 5.00501299, "grad_norm": 0.81798512, "learning_rate": 7.871e-05, "elapsed_time_per_iteration": 6.61483455, "memory(GiB)": 21.51, "elapsed_time": "4h 0m 18s", "remaining_time": "7h 36m 30s", "loss_scale": 1.0, "consumed_samples": 561664, "global_step/max_steps": "2194/6362"} +{"lm loss": 4.98460102, "grad_norm": 0.82071579, "learning_rate": 7.869e-05, "elapsed_time_per_iteration": 6.3768115, "memory(GiB)": 21.51, "elapsed_time": "4h 0m 24s", "remaining_time": "7h 36m 23s", "loss_scale": 1.0, "consumed_samples": 561920, "global_step/max_steps": "2195/6362"} +{"lm loss": 5.0104599, "grad_norm": 0.77797836, "learning_rate": 7.867e-05, "elapsed_time_per_iteration": 6.65298963, "memory(GiB)": 21.51, "elapsed_time": "4h 0m 31s", "remaining_time": "7h 36m 17s", "loss_scale": 1.0, "consumed_samples": 562176, "global_step/max_steps": "2196/6362"} +{"lm loss": 4.99202728, "grad_norm": 0.75446892, "learning_rate": 7.865e-05, "elapsed_time_per_iteration": 6.55285215, "memory(GiB)": 21.51, "elapsed_time": "4h 0m 37s", "remaining_time": "7h 36m 10s", "loss_scale": 1.0, "consumed_samples": 562432, "global_step/max_steps": "2197/6362"} +{"lm loss": 5.0098443, "grad_norm": 0.74197584, "learning_rate": 7.863e-05, "elapsed_time_per_iteration": 6.46485639, "memory(GiB)": 21.51, "elapsed_time": "4h 0m 44s", "remaining_time": "7h 36m 3s", "loss_scale": 1.0, "consumed_samples": 562688, "global_step/max_steps": "2198/6362"} +{"lm loss": 5.0043211, "grad_norm": 0.82502967, "learning_rate": 7.861e-05, "elapsed_time_per_iteration": 6.7162776, "memory(GiB)": 21.51, "elapsed_time": "4h 0m 50s", "remaining_time": "7h 35m 57s", "loss_scale": 1.0, "consumed_samples": 562944, "global_step/max_steps": "2199/6362"} +{"lm loss": 5.01478481, "grad_norm": 0.66300398, "learning_rate": 7.859e-05, "elapsed_time_per_iteration": 6.51698279, "memory(GiB)": 21.51, "elapsed_time": "4h 0m 57s", "remaining_time": "7h 35m 50s", "loss_scale": 1.0, "consumed_samples": 563200, "global_step/max_steps": "2200/6362"} +{"lm loss": 4.99402142, "grad_norm": 0.62640709, "learning_rate": 7.857e-05, "elapsed_time_per_iteration": 6.48752356, "memory(GiB)": 21.51, "elapsed_time": "4h 1m 3s", "remaining_time": "7h 35m 44s", "loss_scale": 1.0, "consumed_samples": 563456, "global_step/max_steps": "2201/6362"} +{"lm loss": 4.99598551, "grad_norm": 0.64097887, "learning_rate": 7.855e-05, "elapsed_time_per_iteration": 6.35858703, "memory(GiB)": 21.51, "elapsed_time": "4h 1m 10s", "remaining_time": "7h 35m 37s", "loss_scale": 1.0, "consumed_samples": 563712, "global_step/max_steps": "2202/6362"} +{"lm loss": 4.98943806, "grad_norm": 0.64732867, "learning_rate": 7.853e-05, "elapsed_time_per_iteration": 6.39708638, "memory(GiB)": 21.51, "elapsed_time": "4h 1m 16s", "remaining_time": "7h 35m 30s", "loss_scale": 1.0, "consumed_samples": 563968, "global_step/max_steps": "2203/6362"} +{"lm loss": 5.00119829, "grad_norm": 0.69535202, "learning_rate": 7.85e-05, "elapsed_time_per_iteration": 6.56554556, "memory(GiB)": 21.51, "elapsed_time": "4h 1m 23s", "remaining_time": "7h 35m 23s", "loss_scale": 1.0, "consumed_samples": 564224, "global_step/max_steps": "2204/6362"} +{"lm loss": 5.0293231, "grad_norm": 0.55910879, "learning_rate": 7.848e-05, "elapsed_time_per_iteration": 6.73146319, "memory(GiB)": 21.51, "elapsed_time": "4h 1m 30s", "remaining_time": "7h 35m 17s", "loss_scale": 1.0, "consumed_samples": 564480, "global_step/max_steps": "2205/6362"} +{"lm loss": 4.99045801, "grad_norm": 0.63202703, "learning_rate": 7.846e-05, "elapsed_time_per_iteration": 6.40749216, "memory(GiB)": 21.51, "elapsed_time": "4h 1m 36s", "remaining_time": "7h 35m 10s", "loss_scale": 1.0, "consumed_samples": 564736, "global_step/max_steps": "2206/6362"} +{"lm loss": 5.01331139, "grad_norm": 0.57895303, "learning_rate": 7.844e-05, "elapsed_time_per_iteration": 6.45638442, "memory(GiB)": 21.51, "elapsed_time": "4h 1m 42s", "remaining_time": "7h 35m 3s", "loss_scale": 1.0, "consumed_samples": 564992, "global_step/max_steps": "2207/6362"} +{"lm loss": 4.97276163, "grad_norm": 0.63851088, "learning_rate": 7.842e-05, "elapsed_time_per_iteration": 6.48355484, "memory(GiB)": 21.51, "elapsed_time": "4h 1m 49s", "remaining_time": "7h 34m 57s", "loss_scale": 1.0, "consumed_samples": 565248, "global_step/max_steps": "2208/6362"} +{"lm loss": 5.00362444, "grad_norm": 0.64446884, "learning_rate": 7.84e-05, "elapsed_time_per_iteration": 6.80166912, "memory(GiB)": 21.51, "elapsed_time": "4h 1m 56s", "remaining_time": "7h 34m 50s", "loss_scale": 1.0, "consumed_samples": 565504, "global_step/max_steps": "2209/6362"} +{"lm loss": 5.00953388, "grad_norm": 0.57945359, "learning_rate": 7.838e-05, "elapsed_time_per_iteration": 6.66907072, "memory(GiB)": 21.51, "elapsed_time": "4h 2m 2s", "remaining_time": "7h 34m 44s", "loss_scale": 1.0, "consumed_samples": 565760, "global_step/max_steps": "2210/6362"} +{"lm loss": 4.98296976, "grad_norm": 0.66507906, "learning_rate": 7.836e-05, "elapsed_time_per_iteration": 6.68006849, "memory(GiB)": 21.51, "elapsed_time": "4h 2m 9s", "remaining_time": "7h 34m 38s", "loss_scale": 1.0, "consumed_samples": 566016, "global_step/max_steps": "2211/6362"} +{"lm loss": 5.02024364, "grad_norm": 0.69320494, "learning_rate": 7.834e-05, "elapsed_time_per_iteration": 6.43107176, "memory(GiB)": 21.51, "elapsed_time": "4h 2m 15s", "remaining_time": "7h 34m 31s", "loss_scale": 1.0, "consumed_samples": 566272, "global_step/max_steps": "2212/6362"} +{"lm loss": 5.01611233, "grad_norm": 0.72100121, "learning_rate": 7.832e-05, "elapsed_time_per_iteration": 6.55561519, "memory(GiB)": 21.51, "elapsed_time": "4h 2m 22s", "remaining_time": "7h 34m 24s", "loss_scale": 1.0, "consumed_samples": 566528, "global_step/max_steps": "2213/6362"} +{"lm loss": 4.98430777, "grad_norm": 0.68884474, "learning_rate": 7.829e-05, "elapsed_time_per_iteration": 6.58646917, "memory(GiB)": 21.51, "elapsed_time": "4h 2m 29s", "remaining_time": "7h 34m 18s", "loss_scale": 1.0, "consumed_samples": 566784, "global_step/max_steps": "2214/6362"} +{"lm loss": 5.02714777, "grad_norm": 0.75981379, "learning_rate": 7.827e-05, "elapsed_time_per_iteration": 6.54760385, "memory(GiB)": 21.51, "elapsed_time": "4h 2m 35s", "remaining_time": "7h 34m 11s", "loss_scale": 1.0, "consumed_samples": 567040, "global_step/max_steps": "2215/6362"} +{"lm loss": 5.01948261, "grad_norm": 0.69992411, "learning_rate": 7.825e-05, "elapsed_time_per_iteration": 6.92797875, "memory(GiB)": 21.51, "elapsed_time": "4h 2m 42s", "remaining_time": "7h 34m 5s", "loss_scale": 1.0, "consumed_samples": 567296, "global_step/max_steps": "2216/6362"} +{"lm loss": 4.99469852, "grad_norm": 0.61510831, "learning_rate": 7.823e-05, "elapsed_time_per_iteration": 6.53407192, "memory(GiB)": 21.51, "elapsed_time": "4h 2m 49s", "remaining_time": "7h 33m 59s", "loss_scale": 1.0, "consumed_samples": 567552, "global_step/max_steps": "2217/6362"} +{"lm loss": 5.00849009, "grad_norm": 0.62922704, "learning_rate": 7.821e-05, "elapsed_time_per_iteration": 6.77769136, "memory(GiB)": 21.51, "elapsed_time": "4h 2m 55s", "remaining_time": "7h 33m 52s", "loss_scale": 1.0, "consumed_samples": 567808, "global_step/max_steps": "2218/6362"} +{"lm loss": 5.00170708, "grad_norm": 0.63994527, "learning_rate": 7.819e-05, "elapsed_time_per_iteration": 6.67612267, "memory(GiB)": 21.51, "elapsed_time": "4h 3m 2s", "remaining_time": "7h 33m 46s", "loss_scale": 1.0, "consumed_samples": 568064, "global_step/max_steps": "2219/6362"} +{"lm loss": 4.9788084, "grad_norm": 0.70068872, "learning_rate": 7.817e-05, "elapsed_time_per_iteration": 6.56384921, "memory(GiB)": 21.51, "elapsed_time": "4h 3m 9s", "remaining_time": "7h 33m 39s", "loss_scale": 1.0, "consumed_samples": 568320, "global_step/max_steps": "2220/6362"} +{"lm loss": 4.98592758, "grad_norm": 0.69412661, "learning_rate": 7.815e-05, "elapsed_time_per_iteration": 6.67214251, "memory(GiB)": 21.51, "elapsed_time": "4h 3m 15s", "remaining_time": "7h 33m 33s", "loss_scale": 1.0, "consumed_samples": 568576, "global_step/max_steps": "2221/6362"} +{"lm loss": 4.9937501, "grad_norm": 0.67879426, "learning_rate": 7.813e-05, "elapsed_time_per_iteration": 6.77142334, "memory(GiB)": 21.51, "elapsed_time": "4h 3m 22s", "remaining_time": "7h 33m 27s", "loss_scale": 1.0, "consumed_samples": 568832, "global_step/max_steps": "2222/6362"} +{"lm loss": 4.99704933, "grad_norm": 0.72317958, "learning_rate": 7.811e-05, "elapsed_time_per_iteration": 6.48243403, "memory(GiB)": 21.51, "elapsed_time": "4h 3m 29s", "remaining_time": "7h 33m 20s", "loss_scale": 1.0, "consumed_samples": 569088, "global_step/max_steps": "2223/6362"} +{"lm loss": 5.00685358, "grad_norm": 0.8109712, "learning_rate": 7.808e-05, "elapsed_time_per_iteration": 6.68675923, "memory(GiB)": 21.51, "elapsed_time": "4h 3m 35s", "remaining_time": "7h 33m 14s", "loss_scale": 1.0, "consumed_samples": 569344, "global_step/max_steps": "2224/6362"} +{"lm loss": 5.00861692, "grad_norm": 0.74672681, "learning_rate": 7.806e-05, "elapsed_time_per_iteration": 6.69920778, "memory(GiB)": 21.51, "elapsed_time": "4h 3m 42s", "remaining_time": "7h 33m 7s", "loss_scale": 1.0, "consumed_samples": 569600, "global_step/max_steps": "2225/6362"} +{"lm loss": 5.01147699, "grad_norm": 0.61034596, "learning_rate": 7.804e-05, "elapsed_time_per_iteration": 6.3891921, "memory(GiB)": 21.51, "elapsed_time": "4h 3m 48s", "remaining_time": "7h 33m 1s", "loss_scale": 1.0, "consumed_samples": 569856, "global_step/max_steps": "2226/6362"} +{"lm loss": 5.0074625, "grad_norm": 0.56889975, "learning_rate": 7.802e-05, "elapsed_time_per_iteration": 6.53358245, "memory(GiB)": 21.51, "elapsed_time": "4h 3m 55s", "remaining_time": "7h 32m 54s", "loss_scale": 1.0, "consumed_samples": 570112, "global_step/max_steps": "2227/6362"} +{"lm loss": 5.00309992, "grad_norm": 0.62261963, "learning_rate": 7.8e-05, "elapsed_time_per_iteration": 6.39469981, "memory(GiB)": 21.51, "elapsed_time": "4h 4m 1s", "remaining_time": "7h 32m 47s", "loss_scale": 1.0, "consumed_samples": 570368, "global_step/max_steps": "2228/6362"} +{"lm loss": 4.99544907, "grad_norm": 0.62080008, "learning_rate": 7.798e-05, "elapsed_time_per_iteration": 6.51844859, "memory(GiB)": 21.51, "elapsed_time": "4h 4m 8s", "remaining_time": "7h 32m 40s", "loss_scale": 1.0, "consumed_samples": 570624, "global_step/max_steps": "2229/6362"} +{"lm loss": 5.0166378, "grad_norm": 0.51641256, "learning_rate": 7.796e-05, "elapsed_time_per_iteration": 6.49093318, "memory(GiB)": 21.51, "elapsed_time": "4h 4m 14s", "remaining_time": "7h 32m 34s", "loss_scale": 1.0, "consumed_samples": 570880, "global_step/max_steps": "2230/6362"} +{"lm loss": 4.99117517, "grad_norm": 0.53274208, "learning_rate": 7.794e-05, "elapsed_time_per_iteration": 6.45638847, "memory(GiB)": 21.51, "elapsed_time": "4h 4m 21s", "remaining_time": "7h 32m 27s", "loss_scale": 1.0, "consumed_samples": 571136, "global_step/max_steps": "2231/6362"} +{"lm loss": 4.99860668, "grad_norm": 0.51629335, "learning_rate": 7.792e-05, "elapsed_time_per_iteration": 6.42711616, "memory(GiB)": 21.51, "elapsed_time": "4h 4m 27s", "remaining_time": "7h 32m 20s", "loss_scale": 1.0, "consumed_samples": 571392, "global_step/max_steps": "2232/6362"} +{"lm loss": 5.01982546, "grad_norm": 0.54316229, "learning_rate": 7.789e-05, "elapsed_time_per_iteration": 6.50143456, "memory(GiB)": 21.51, "elapsed_time": "4h 4m 34s", "remaining_time": "7h 32m 13s", "loss_scale": 1.0, "consumed_samples": 571648, "global_step/max_steps": "2233/6362"} +{"lm loss": 4.99407387, "grad_norm": 0.63358206, "learning_rate": 7.787e-05, "elapsed_time_per_iteration": 6.46118712, "memory(GiB)": 21.51, "elapsed_time": "4h 4m 40s", "remaining_time": "7h 32m 6s", "loss_scale": 1.0, "consumed_samples": 571904, "global_step/max_steps": "2234/6362"} +{"lm loss": 5.01844454, "grad_norm": 0.62573171, "learning_rate": 7.785e-05, "elapsed_time_per_iteration": 6.48524714, "memory(GiB)": 21.51, "elapsed_time": "4h 4m 47s", "remaining_time": "7h 32m 0s", "loss_scale": 1.0, "consumed_samples": 572160, "global_step/max_steps": "2235/6362"} +{"lm loss": 5.0036025, "grad_norm": 0.66217083, "learning_rate": 7.783e-05, "elapsed_time_per_iteration": 6.76796436, "memory(GiB)": 21.51, "elapsed_time": "4h 4m 53s", "remaining_time": "7h 31m 54s", "loss_scale": 1.0, "consumed_samples": 572416, "global_step/max_steps": "2236/6362"} +{"lm loss": 5.01331568, "grad_norm": 0.69387281, "learning_rate": 7.781e-05, "elapsed_time_per_iteration": 6.5948782, "memory(GiB)": 21.51, "elapsed_time": "4h 5m 0s", "remaining_time": "7h 31m 47s", "loss_scale": 1.0, "consumed_samples": 572672, "global_step/max_steps": "2237/6362"} +{"lm loss": 4.98057318, "grad_norm": 0.67871881, "learning_rate": 7.779e-05, "elapsed_time_per_iteration": 6.47685409, "memory(GiB)": 21.51, "elapsed_time": "4h 5m 6s", "remaining_time": "7h 31m 40s", "loss_scale": 1.0, "consumed_samples": 572928, "global_step/max_steps": "2238/6362"} +{"lm loss": 4.99980497, "grad_norm": 0.80461794, "learning_rate": 7.777e-05, "elapsed_time_per_iteration": 6.4487927, "memory(GiB)": 21.51, "elapsed_time": "4h 5m 13s", "remaining_time": "7h 31m 33s", "loss_scale": 1.0, "consumed_samples": 573184, "global_step/max_steps": "2239/6362"} +{"lm loss": 5.01216793, "grad_norm": 0.76950264, "learning_rate": 7.775e-05, "elapsed_time_per_iteration": 6.55601358, "memory(GiB)": 21.51, "elapsed_time": "4h 5m 19s", "remaining_time": "7h 31m 27s", "loss_scale": 1.0, "consumed_samples": 573440, "global_step/max_steps": "2240/6362"} +{"lm loss": 4.98382139, "grad_norm": 0.60791147, "learning_rate": 7.772e-05, "elapsed_time_per_iteration": 6.6810081, "memory(GiB)": 21.51, "elapsed_time": "4h 5m 26s", "remaining_time": "7h 31m 20s", "loss_scale": 1.0, "consumed_samples": 573696, "global_step/max_steps": "2241/6362"} +{"lm loss": 4.9784379, "grad_norm": 0.60480648, "learning_rate": 7.77e-05, "elapsed_time_per_iteration": 6.75650215, "memory(GiB)": 21.51, "elapsed_time": "4h 5m 33s", "remaining_time": "7h 31m 14s", "loss_scale": 1.0, "consumed_samples": 573952, "global_step/max_steps": "2242/6362"} +{"lm loss": 4.98079348, "grad_norm": 0.55829036, "learning_rate": 7.768e-05, "elapsed_time_per_iteration": 6.72212887, "memory(GiB)": 21.51, "elapsed_time": "4h 5m 40s", "remaining_time": "7h 31m 8s", "loss_scale": 1.0, "consumed_samples": 574208, "global_step/max_steps": "2243/6362"} +{"lm loss": 4.99274588, "grad_norm": 0.65241128, "learning_rate": 7.766e-05, "elapsed_time_per_iteration": 6.55209136, "memory(GiB)": 21.51, "elapsed_time": "4h 5m 46s", "remaining_time": "7h 31m 1s", "loss_scale": 1.0, "consumed_samples": 574464, "global_step/max_steps": "2244/6362"} +{"lm loss": 5.00427103, "grad_norm": 0.67516845, "learning_rate": 7.764e-05, "elapsed_time_per_iteration": 6.45485377, "memory(GiB)": 21.51, "elapsed_time": "4h 5m 53s", "remaining_time": "7h 30m 55s", "loss_scale": 1.0, "consumed_samples": 574720, "global_step/max_steps": "2245/6362"} +{"lm loss": 4.99265099, "grad_norm": 0.70068783, "learning_rate": 7.762e-05, "elapsed_time_per_iteration": 6.6991148, "memory(GiB)": 21.51, "elapsed_time": "4h 5m 59s", "remaining_time": "7h 30m 48s", "loss_scale": 1.0, "consumed_samples": 574976, "global_step/max_steps": "2246/6362"} +{"lm loss": 5.00986958, "grad_norm": 0.801337, "learning_rate": 7.76e-05, "elapsed_time_per_iteration": 6.3585093, "memory(GiB)": 21.51, "elapsed_time": "4h 6m 6s", "remaining_time": "7h 30m 41s", "loss_scale": 1.0, "consumed_samples": 575232, "global_step/max_steps": "2247/6362"} +{"lm loss": 5.00444031, "grad_norm": 0.82602423, "learning_rate": 7.758e-05, "elapsed_time_per_iteration": 6.5645237, "memory(GiB)": 21.51, "elapsed_time": "4h 6m 12s", "remaining_time": "7h 30m 35s", "loss_scale": 1.0, "consumed_samples": 575488, "global_step/max_steps": "2248/6362"} +{"lm loss": 5.03302288, "grad_norm": 0.81263429, "learning_rate": 7.755e-05, "elapsed_time_per_iteration": 6.53379107, "memory(GiB)": 21.51, "elapsed_time": "4h 6m 19s", "remaining_time": "7h 30m 28s", "loss_scale": 1.0, "consumed_samples": 575744, "global_step/max_steps": "2249/6362"} +{"lm loss": 5.01256704, "grad_norm": 0.74087453, "learning_rate": 7.753e-05, "elapsed_time_per_iteration": 6.61112809, "memory(GiB)": 21.51, "elapsed_time": "4h 6m 25s", "remaining_time": "7h 30m 22s", "loss_scale": 1.0, "consumed_samples": 576000, "global_step/max_steps": "2250/6362"} +{"lm loss": 5.00493956, "grad_norm": 0.72700244, "learning_rate": 7.751e-05, "elapsed_time_per_iteration": 6.4578886, "memory(GiB)": 21.51, "elapsed_time": "4h 6m 32s", "remaining_time": "7h 30m 15s", "loss_scale": 1.0, "consumed_samples": 576256, "global_step/max_steps": "2251/6362"} +{"lm loss": 5.00545883, "grad_norm": 0.67657048, "learning_rate": 7.749e-05, "elapsed_time_per_iteration": 6.61339498, "memory(GiB)": 21.51, "elapsed_time": "4h 6m 38s", "remaining_time": "7h 30m 8s", "loss_scale": 1.0, "consumed_samples": 576512, "global_step/max_steps": "2252/6362"} +{"lm loss": 5.01832485, "grad_norm": 0.66984391, "learning_rate": 7.747e-05, "elapsed_time_per_iteration": 6.58992457, "memory(GiB)": 21.51, "elapsed_time": "4h 6m 45s", "remaining_time": "7h 30m 2s", "loss_scale": 1.0, "consumed_samples": 576768, "global_step/max_steps": "2253/6362"} +{"lm loss": 4.99121761, "grad_norm": 0.70069414, "learning_rate": 7.745e-05, "elapsed_time_per_iteration": 6.53689599, "memory(GiB)": 21.51, "elapsed_time": "4h 6m 52s", "remaining_time": "7h 29m 55s", "loss_scale": 1.0, "consumed_samples": 577024, "global_step/max_steps": "2254/6362"} +{"lm loss": 4.9830308, "grad_norm": 0.71592265, "learning_rate": 7.743e-05, "elapsed_time_per_iteration": 6.67917824, "memory(GiB)": 21.51, "elapsed_time": "4h 6m 58s", "remaining_time": "7h 29m 49s", "loss_scale": 1.0, "consumed_samples": 577280, "global_step/max_steps": "2255/6362"} +{"lm loss": 5.00201082, "grad_norm": 0.71734983, "learning_rate": 7.741e-05, "elapsed_time_per_iteration": 6.53603172, "memory(GiB)": 21.51, "elapsed_time": "4h 7m 5s", "remaining_time": "7h 29m 42s", "loss_scale": 1.0, "consumed_samples": 577536, "global_step/max_steps": "2256/6362"} +{"lm loss": 5.00218868, "grad_norm": 0.71271235, "learning_rate": 7.738e-05, "elapsed_time_per_iteration": 6.64582324, "memory(GiB)": 21.51, "elapsed_time": "4h 7m 11s", "remaining_time": "7h 29m 36s", "loss_scale": 1.0, "consumed_samples": 577792, "global_step/max_steps": "2257/6362"} +{"lm loss": 5.00556231, "grad_norm": 0.64945978, "learning_rate": 7.736e-05, "elapsed_time_per_iteration": 6.53029561, "memory(GiB)": 21.51, "elapsed_time": "4h 7m 18s", "remaining_time": "7h 29m 29s", "loss_scale": 1.0, "consumed_samples": 578048, "global_step/max_steps": "2258/6362"} +{"lm loss": 4.98678398, "grad_norm": 0.639718, "learning_rate": 7.734e-05, "elapsed_time_per_iteration": 6.60208797, "memory(GiB)": 21.51, "elapsed_time": "4h 7m 25s", "remaining_time": "7h 29m 22s", "loss_scale": 1.0, "consumed_samples": 578304, "global_step/max_steps": "2259/6362"} +{"lm loss": 5.01048326, "grad_norm": 0.62056983, "learning_rate": 7.732e-05, "elapsed_time_per_iteration": 6.65831256, "memory(GiB)": 21.51, "elapsed_time": "4h 7m 31s", "remaining_time": "7h 29m 16s", "loss_scale": 1.0, "consumed_samples": 578560, "global_step/max_steps": "2260/6362"} +{"lm loss": 5.0052247, "grad_norm": 0.65302002, "learning_rate": 7.73e-05, "elapsed_time_per_iteration": 6.65607214, "memory(GiB)": 21.51, "elapsed_time": "4h 7m 38s", "remaining_time": "7h 29m 10s", "loss_scale": 1.0, "consumed_samples": 578816, "global_step/max_steps": "2261/6362"} +{"lm loss": 4.97974825, "grad_norm": 0.77914983, "learning_rate": 7.728e-05, "elapsed_time_per_iteration": 6.43701911, "memory(GiB)": 21.51, "elapsed_time": "4h 7m 44s", "remaining_time": "7h 29m 3s", "loss_scale": 1.0, "consumed_samples": 579072, "global_step/max_steps": "2262/6362"} +{"lm loss": 5.01157999, "grad_norm": 0.71573228, "learning_rate": 7.726e-05, "elapsed_time_per_iteration": 6.71473455, "memory(GiB)": 21.51, "elapsed_time": "4h 7m 51s", "remaining_time": "7h 28m 57s", "loss_scale": 1.0, "consumed_samples": 579328, "global_step/max_steps": "2263/6362"} +{"lm loss": 4.97667074, "grad_norm": 0.5861007, "learning_rate": 7.723e-05, "elapsed_time_per_iteration": 6.67233634, "memory(GiB)": 21.51, "elapsed_time": "4h 7m 58s", "remaining_time": "7h 28m 50s", "loss_scale": 1.0, "consumed_samples": 579584, "global_step/max_steps": "2264/6362"} +{"lm loss": 4.99150419, "grad_norm": 0.59286094, "learning_rate": 7.721e-05, "elapsed_time_per_iteration": 6.56590629, "memory(GiB)": 21.51, "elapsed_time": "4h 8m 4s", "remaining_time": "7h 28m 44s", "loss_scale": 1.0, "consumed_samples": 579840, "global_step/max_steps": "2265/6362"} +{"lm loss": 5.00205517, "grad_norm": 0.56274194, "learning_rate": 7.719e-05, "elapsed_time_per_iteration": 6.42292523, "memory(GiB)": 21.51, "elapsed_time": "4h 8m 11s", "remaining_time": "7h 28m 37s", "loss_scale": 1.0, "consumed_samples": 580096, "global_step/max_steps": "2266/6362"} +{"lm loss": 5.00385618, "grad_norm": 0.53897524, "learning_rate": 7.717e-05, "elapsed_time_per_iteration": 6.75891972, "memory(GiB)": 21.51, "elapsed_time": "4h 8m 17s", "remaining_time": "7h 28m 30s", "loss_scale": 1.0, "consumed_samples": 580352, "global_step/max_steps": "2267/6362"} +{"lm loss": 5.02458191, "grad_norm": 0.56944281, "learning_rate": 7.715e-05, "elapsed_time_per_iteration": 6.61824846, "memory(GiB)": 21.51, "elapsed_time": "4h 8m 24s", "remaining_time": "7h 28m 24s", "loss_scale": 1.0, "consumed_samples": 580608, "global_step/max_steps": "2268/6362"} +{"lm loss": 5.00763464, "grad_norm": 0.55756682, "learning_rate": 7.713e-05, "elapsed_time_per_iteration": 6.53756762, "memory(GiB)": 21.51, "elapsed_time": "4h 8m 31s", "remaining_time": "7h 28m 17s", "loss_scale": 1.0, "consumed_samples": 580864, "global_step/max_steps": "2269/6362"} +{"lm loss": 5.01168537, "grad_norm": 0.61330181, "learning_rate": 7.711e-05, "elapsed_time_per_iteration": 6.46153641, "memory(GiB)": 21.51, "elapsed_time": "4h 8m 37s", "remaining_time": "7h 28m 11s", "loss_scale": 1.0, "consumed_samples": 581120, "global_step/max_steps": "2270/6362"} +{"lm loss": 5.00991249, "grad_norm": 0.67104393, "learning_rate": 7.709e-05, "elapsed_time_per_iteration": 6.53441072, "memory(GiB)": 21.51, "elapsed_time": "4h 8m 44s", "remaining_time": "7h 28m 4s", "loss_scale": 1.0, "consumed_samples": 581376, "global_step/max_steps": "2271/6362"} +{"lm loss": 5.01418924, "grad_norm": 0.67511779, "learning_rate": 7.706e-05, "elapsed_time_per_iteration": 6.49696374, "memory(GiB)": 21.51, "elapsed_time": "4h 8m 50s", "remaining_time": "7h 27m 57s", "loss_scale": 1.0, "consumed_samples": 581632, "global_step/max_steps": "2272/6362"} +{"lm loss": 4.98848152, "grad_norm": 0.64220452, "learning_rate": 7.704e-05, "elapsed_time_per_iteration": 6.56662917, "memory(GiB)": 21.51, "elapsed_time": "4h 8m 57s", "remaining_time": "7h 27m 51s", "loss_scale": 1.0, "consumed_samples": 581888, "global_step/max_steps": "2273/6362"} +{"lm loss": 4.99924374, "grad_norm": 0.71804601, "learning_rate": 7.702e-05, "elapsed_time_per_iteration": 6.44957709, "memory(GiB)": 21.51, "elapsed_time": "4h 9m 3s", "remaining_time": "7h 27m 44s", "loss_scale": 1.0, "consumed_samples": 582144, "global_step/max_steps": "2274/6362"} +{"lm loss": 5.01241446, "grad_norm": 0.85043937, "learning_rate": 7.7e-05, "elapsed_time_per_iteration": 6.55668259, "memory(GiB)": 21.51, "elapsed_time": "4h 9m 10s", "remaining_time": "7h 27m 37s", "loss_scale": 1.0, "consumed_samples": 582400, "global_step/max_steps": "2275/6362"} +{"lm loss": 4.99622822, "grad_norm": 0.88985962, "learning_rate": 7.698e-05, "elapsed_time_per_iteration": 6.60364246, "memory(GiB)": 21.51, "elapsed_time": "4h 9m 16s", "remaining_time": "7h 27m 31s", "loss_scale": 1.0, "consumed_samples": 582656, "global_step/max_steps": "2276/6362"} +{"lm loss": 4.98993683, "grad_norm": 0.79709387, "learning_rate": 7.696e-05, "elapsed_time_per_iteration": 6.7337122, "memory(GiB)": 21.51, "elapsed_time": "4h 9m 23s", "remaining_time": "7h 27m 24s", "loss_scale": 1.0, "consumed_samples": 582912, "global_step/max_steps": "2277/6362"} +{"lm loss": 4.98518467, "grad_norm": 0.74533212, "learning_rate": 7.693e-05, "elapsed_time_per_iteration": 6.43362737, "memory(GiB)": 21.51, "elapsed_time": "4h 9m 29s", "remaining_time": "7h 27m 18s", "loss_scale": 1.0, "consumed_samples": 583168, "global_step/max_steps": "2278/6362"} +{"lm loss": 5.00603104, "grad_norm": 0.68719411, "learning_rate": 7.691e-05, "elapsed_time_per_iteration": 6.6230619, "memory(GiB)": 21.51, "elapsed_time": "4h 9m 36s", "remaining_time": "7h 27m 11s", "loss_scale": 1.0, "consumed_samples": 583424, "global_step/max_steps": "2279/6362"} +{"lm loss": 4.99121475, "grad_norm": 0.61078852, "learning_rate": 7.689e-05, "elapsed_time_per_iteration": 6.79965162, "memory(GiB)": 21.51, "elapsed_time": "4h 9m 43s", "remaining_time": "7h 27m 5s", "loss_scale": 1.0, "consumed_samples": 583680, "global_step/max_steps": "2280/6362"} +{"lm loss": 5.00608635, "grad_norm": 0.60520816, "learning_rate": 7.687e-05, "elapsed_time_per_iteration": 6.36885428, "memory(GiB)": 21.51, "elapsed_time": "4h 9m 49s", "remaining_time": "7h 26m 58s", "loss_scale": 1.0, "consumed_samples": 583936, "global_step/max_steps": "2281/6362"} +{"lm loss": 4.99750376, "grad_norm": 0.60738552, "learning_rate": 7.685e-05, "elapsed_time_per_iteration": 6.57641912, "memory(GiB)": 21.51, "elapsed_time": "4h 9m 56s", "remaining_time": "7h 26m 52s", "loss_scale": 1.0, "consumed_samples": 584192, "global_step/max_steps": "2282/6362"} +{"lm loss": 4.97786903, "grad_norm": 0.54550785, "learning_rate": 7.683e-05, "elapsed_time_per_iteration": 6.35040164, "memory(GiB)": 21.51, "elapsed_time": "4h 10m 2s", "remaining_time": "7h 26m 45s", "loss_scale": 1.0, "consumed_samples": 584448, "global_step/max_steps": "2283/6362"} +{"lm loss": 4.99279451, "grad_norm": 0.62762672, "learning_rate": 7.681e-05, "elapsed_time_per_iteration": 6.56926012, "memory(GiB)": 21.51, "elapsed_time": "4h 10m 9s", "remaining_time": "7h 26m 38s", "loss_scale": 1.0, "consumed_samples": 584704, "global_step/max_steps": "2284/6362"} +{"lm loss": 4.99837875, "grad_norm": 0.62240267, "learning_rate": 7.678e-05, "elapsed_time_per_iteration": 6.5923562, "memory(GiB)": 21.51, "elapsed_time": "4h 10m 15s", "remaining_time": "7h 26m 31s", "loss_scale": 1.0, "consumed_samples": 584960, "global_step/max_steps": "2285/6362"} +{"lm loss": 4.99472809, "grad_norm": 0.63703579, "learning_rate": 7.676e-05, "elapsed_time_per_iteration": 6.58221483, "memory(GiB)": 21.51, "elapsed_time": "4h 10m 22s", "remaining_time": "7h 26m 25s", "loss_scale": 1.0, "consumed_samples": 585216, "global_step/max_steps": "2286/6362"} +{"lm loss": 4.97259617, "grad_norm": 0.75882363, "learning_rate": 7.674e-05, "elapsed_time_per_iteration": 6.60956526, "memory(GiB)": 21.51, "elapsed_time": "4h 10m 29s", "remaining_time": "7h 26m 18s", "loss_scale": 1.0, "consumed_samples": 585472, "global_step/max_steps": "2287/6362"} +{"lm loss": 4.98463678, "grad_norm": 0.77708822, "learning_rate": 7.672e-05, "elapsed_time_per_iteration": 6.62220144, "memory(GiB)": 21.51, "elapsed_time": "4h 10m 35s", "remaining_time": "7h 26m 12s", "loss_scale": 1.0, "consumed_samples": 585728, "global_step/max_steps": "2288/6362"} +{"lm loss": 4.95764923, "grad_norm": 0.72589278, "learning_rate": 7.67e-05, "elapsed_time_per_iteration": 6.45537305, "memory(GiB)": 21.51, "elapsed_time": "4h 10m 42s", "remaining_time": "7h 26m 5s", "loss_scale": 1.0, "consumed_samples": 585984, "global_step/max_steps": "2289/6362"} +{"lm loss": 5.00321484, "grad_norm": 0.79142851, "learning_rate": 7.668e-05, "elapsed_time_per_iteration": 6.68973064, "memory(GiB)": 21.51, "elapsed_time": "4h 10m 48s", "remaining_time": "7h 25m 59s", "loss_scale": 1.0, "consumed_samples": 586240, "global_step/max_steps": "2290/6362"} +{"lm loss": 5.02646828, "grad_norm": 0.75777638, "learning_rate": 7.666e-05, "elapsed_time_per_iteration": 6.48174691, "memory(GiB)": 21.51, "elapsed_time": "4h 10m 55s", "remaining_time": "7h 25m 52s", "loss_scale": 1.0, "consumed_samples": 586496, "global_step/max_steps": "2291/6362"} +{"lm loss": 4.98317337, "grad_norm": 0.80108887, "learning_rate": 7.663e-05, "elapsed_time_per_iteration": 6.39149165, "memory(GiB)": 21.51, "elapsed_time": "4h 11m 1s", "remaining_time": "7h 25m 45s", "loss_scale": 1.0, "consumed_samples": 586752, "global_step/max_steps": "2292/6362"} +{"lm loss": 5.00822115, "grad_norm": 0.7282781, "learning_rate": 7.661e-05, "elapsed_time_per_iteration": 6.38198233, "memory(GiB)": 21.51, "elapsed_time": "4h 11m 8s", "remaining_time": "7h 25m 38s", "loss_scale": 1.0, "consumed_samples": 587008, "global_step/max_steps": "2293/6362"} +{"lm loss": 4.98453808, "grad_norm": 0.67545003, "learning_rate": 7.659e-05, "elapsed_time_per_iteration": 6.39177847, "memory(GiB)": 21.51, "elapsed_time": "4h 11m 14s", "remaining_time": "7h 25m 31s", "loss_scale": 1.0, "consumed_samples": 587264, "global_step/max_steps": "2294/6362"} +{"lm loss": 4.99536943, "grad_norm": 0.70422286, "learning_rate": 7.657e-05, "elapsed_time_per_iteration": 6.49761033, "memory(GiB)": 21.51, "elapsed_time": "4h 11m 20s", "remaining_time": "7h 25m 25s", "loss_scale": 1.0, "consumed_samples": 587520, "global_step/max_steps": "2295/6362"} +{"lm loss": 4.97821569, "grad_norm": 0.67163801, "learning_rate": 7.655e-05, "elapsed_time_per_iteration": 6.64553928, "memory(GiB)": 21.51, "elapsed_time": "4h 11m 27s", "remaining_time": "7h 25m 18s", "loss_scale": 1.0, "consumed_samples": 587776, "global_step/max_steps": "2296/6362"} +{"lm loss": 4.97712517, "grad_norm": 0.66527259, "learning_rate": 7.653e-05, "elapsed_time_per_iteration": 6.45959496, "memory(GiB)": 21.51, "elapsed_time": "4h 11m 34s", "remaining_time": "7h 25m 11s", "loss_scale": 1.0, "consumed_samples": 588032, "global_step/max_steps": "2297/6362"} +{"lm loss": 4.98198128, "grad_norm": 0.73574167, "learning_rate": 7.65e-05, "elapsed_time_per_iteration": 6.57026815, "memory(GiB)": 21.51, "elapsed_time": "4h 11m 40s", "remaining_time": "7h 25m 5s", "loss_scale": 1.0, "consumed_samples": 588288, "global_step/max_steps": "2298/6362"} +{"lm loss": 4.98660803, "grad_norm": 0.57670045, "learning_rate": 7.648e-05, "elapsed_time_per_iteration": 6.66714239, "memory(GiB)": 21.51, "elapsed_time": "4h 11m 47s", "remaining_time": "7h 24m 58s", "loss_scale": 1.0, "consumed_samples": 588544, "global_step/max_steps": "2299/6362"} +{"lm loss": 5.00316381, "grad_norm": 0.5690167, "learning_rate": 7.646e-05, "elapsed_time_per_iteration": 6.62058783, "memory(GiB)": 21.51, "elapsed_time": "4h 11m 53s", "remaining_time": "7h 24m 52s", "loss_scale": 1.0, "consumed_samples": 588800, "global_step/max_steps": "2300/6362"} +{"lm loss": 5.01206255, "grad_norm": 0.72980744, "learning_rate": 7.644e-05, "elapsed_time_per_iteration": 6.32893014, "memory(GiB)": 21.51, "elapsed_time": "4h 12m 0s", "remaining_time": "7h 24m 45s", "loss_scale": 1.0, "consumed_samples": 589056, "global_step/max_steps": "2301/6362"} +{"lm loss": 5.00171852, "grad_norm": 0.60406864, "learning_rate": 7.642e-05, "elapsed_time_per_iteration": 6.50308609, "memory(GiB)": 21.51, "elapsed_time": "4h 12m 6s", "remaining_time": "7h 24m 38s", "loss_scale": 1.0, "consumed_samples": 589312, "global_step/max_steps": "2302/6362"} +{"lm loss": 4.9879446, "grad_norm": 0.54903072, "learning_rate": 7.64e-05, "elapsed_time_per_iteration": 6.80639172, "memory(GiB)": 21.51, "elapsed_time": "4h 12m 13s", "remaining_time": "7h 24m 32s", "loss_scale": 1.0, "consumed_samples": 589568, "global_step/max_steps": "2303/6362"} +{"lm loss": 5.00979805, "grad_norm": 0.55455905, "learning_rate": 7.637e-05, "elapsed_time_per_iteration": 6.60555673, "memory(GiB)": 21.51, "elapsed_time": "4h 12m 20s", "remaining_time": "7h 24m 26s", "loss_scale": 1.0, "consumed_samples": 589824, "global_step/max_steps": "2304/6362"} +{"lm loss": 5.01815081, "grad_norm": 0.55853754, "learning_rate": 7.635e-05, "elapsed_time_per_iteration": 6.65950942, "memory(GiB)": 21.51, "elapsed_time": "4h 12m 26s", "remaining_time": "7h 24m 19s", "loss_scale": 1.0, "consumed_samples": 590080, "global_step/max_steps": "2305/6362"} +{"lm loss": 4.97870922, "grad_norm": 0.55462581, "learning_rate": 7.633e-05, "elapsed_time_per_iteration": 6.41145682, "memory(GiB)": 21.51, "elapsed_time": "4h 12m 33s", "remaining_time": "7h 24m 12s", "loss_scale": 1.0, "consumed_samples": 590336, "global_step/max_steps": "2306/6362"} +{"lm loss": 5.00163364, "grad_norm": 0.60965776, "learning_rate": 7.631e-05, "elapsed_time_per_iteration": 6.60791659, "memory(GiB)": 21.51, "elapsed_time": "4h 12m 39s", "remaining_time": "7h 24m 6s", "loss_scale": 1.0, "consumed_samples": 590592, "global_step/max_steps": "2307/6362"} +{"lm loss": 5.01993418, "grad_norm": 0.62694079, "learning_rate": 7.629e-05, "elapsed_time_per_iteration": 6.54407454, "memory(GiB)": 21.51, "elapsed_time": "4h 12m 46s", "remaining_time": "7h 23m 59s", "loss_scale": 1.0, "consumed_samples": 590848, "global_step/max_steps": "2308/6362"} +{"lm loss": 5.00320625, "grad_norm": 0.65575117, "learning_rate": 7.627e-05, "elapsed_time_per_iteration": 6.57116508, "memory(GiB)": 21.51, "elapsed_time": "4h 12m 52s", "remaining_time": "7h 23m 53s", "loss_scale": 1.0, "consumed_samples": 591104, "global_step/max_steps": "2309/6362"} +{"lm loss": 4.99956512, "grad_norm": 0.64572555, "learning_rate": 7.624e-05, "elapsed_time_per_iteration": 6.8793323, "memory(GiB)": 21.51, "elapsed_time": "4h 12m 59s", "remaining_time": "7h 23m 47s", "loss_scale": 1.0, "consumed_samples": 591360, "global_step/max_steps": "2310/6362"} +{"lm loss": 5.00589466, "grad_norm": 0.6841777, "learning_rate": 7.622e-05, "elapsed_time_per_iteration": 6.46442342, "memory(GiB)": 21.51, "elapsed_time": "4h 13m 6s", "remaining_time": "7h 23m 40s", "loss_scale": 1.0, "consumed_samples": 591616, "global_step/max_steps": "2311/6362"} +{"lm loss": 4.97828245, "grad_norm": 0.68833411, "learning_rate": 7.62e-05, "elapsed_time_per_iteration": 6.57135105, "memory(GiB)": 21.51, "elapsed_time": "4h 13m 12s", "remaining_time": "7h 23m 33s", "loss_scale": 1.0, "consumed_samples": 591872, "global_step/max_steps": "2312/6362"} +{"lm loss": 5.00118446, "grad_norm": 0.60850501, "learning_rate": 7.618e-05, "elapsed_time_per_iteration": 6.65016842, "memory(GiB)": 21.51, "elapsed_time": "4h 13m 19s", "remaining_time": "7h 23m 27s", "loss_scale": 1.0, "consumed_samples": 592128, "global_step/max_steps": "2313/6362"} +{"lm loss": 4.98756266, "grad_norm": 0.61706156, "learning_rate": 7.616e-05, "elapsed_time_per_iteration": 6.64490438, "memory(GiB)": 21.51, "elapsed_time": "4h 13m 26s", "remaining_time": "7h 23m 20s", "loss_scale": 1.0, "consumed_samples": 592384, "global_step/max_steps": "2314/6362"} +{"lm loss": 4.98498011, "grad_norm": 0.60215044, "learning_rate": 7.614e-05, "elapsed_time_per_iteration": 6.70138121, "memory(GiB)": 21.51, "elapsed_time": "4h 13m 32s", "remaining_time": "7h 23m 14s", "loss_scale": 1.0, "consumed_samples": 592640, "global_step/max_steps": "2315/6362"} +{"lm loss": 4.98301363, "grad_norm": 0.5326997, "learning_rate": 7.611e-05, "elapsed_time_per_iteration": 6.57303452, "memory(GiB)": 21.51, "elapsed_time": "4h 13m 39s", "remaining_time": "7h 23m 8s", "loss_scale": 1.0, "consumed_samples": 592896, "global_step/max_steps": "2316/6362"} +{"lm loss": 4.98622513, "grad_norm": 0.5764001, "learning_rate": 7.609e-05, "elapsed_time_per_iteration": 6.76760006, "memory(GiB)": 21.51, "elapsed_time": "4h 13m 46s", "remaining_time": "7h 23m 1s", "loss_scale": 1.0, "consumed_samples": 593152, "global_step/max_steps": "2317/6362"} +{"lm loss": 5.00824833, "grad_norm": 0.57670486, "learning_rate": 7.607e-05, "elapsed_time_per_iteration": 6.56966162, "memory(GiB)": 21.51, "elapsed_time": "4h 13m 52s", "remaining_time": "7h 22m 55s", "loss_scale": 1.0, "consumed_samples": 593408, "global_step/max_steps": "2318/6362"} +{"lm loss": 4.96479797, "grad_norm": 0.61172909, "learning_rate": 7.605e-05, "elapsed_time_per_iteration": 6.63920259, "memory(GiB)": 21.51, "elapsed_time": "4h 13m 59s", "remaining_time": "7h 22m 48s", "loss_scale": 1.0, "consumed_samples": 593664, "global_step/max_steps": "2319/6362"} +{"lm loss": 4.99964762, "grad_norm": 0.68965721, "learning_rate": 7.603e-05, "elapsed_time_per_iteration": 6.64357185, "memory(GiB)": 21.51, "elapsed_time": "4h 14m 6s", "remaining_time": "7h 22m 42s", "loss_scale": 1.0, "consumed_samples": 593920, "global_step/max_steps": "2320/6362"} +{"lm loss": 5.01064014, "grad_norm": 0.63190317, "learning_rate": 7.601e-05, "elapsed_time_per_iteration": 6.61599374, "memory(GiB)": 21.51, "elapsed_time": "4h 14m 12s", "remaining_time": "7h 22m 35s", "loss_scale": 1.0, "consumed_samples": 594176, "global_step/max_steps": "2321/6362"} +{"lm loss": 4.99855709, "grad_norm": 0.52546102, "learning_rate": 7.598e-05, "elapsed_time_per_iteration": 6.64037347, "memory(GiB)": 21.51, "elapsed_time": "4h 14m 19s", "remaining_time": "7h 22m 29s", "loss_scale": 1.0, "consumed_samples": 594432, "global_step/max_steps": "2322/6362"} +{"lm loss": 5.00012016, "grad_norm": 0.55105847, "learning_rate": 7.596e-05, "elapsed_time_per_iteration": 6.70401073, "memory(GiB)": 21.51, "elapsed_time": "4h 14m 26s", "remaining_time": "7h 22m 23s", "loss_scale": 1.0, "consumed_samples": 594688, "global_step/max_steps": "2323/6362"} +{"lm loss": 4.99449968, "grad_norm": 0.5441314, "learning_rate": 7.594e-05, "elapsed_time_per_iteration": 6.56026626, "memory(GiB)": 21.51, "elapsed_time": "4h 14m 32s", "remaining_time": "7h 22m 16s", "loss_scale": 1.0, "consumed_samples": 594944, "global_step/max_steps": "2324/6362"} +{"lm loss": 4.97750711, "grad_norm": 0.58183718, "learning_rate": 7.592e-05, "elapsed_time_per_iteration": 6.59219122, "memory(GiB)": 21.51, "elapsed_time": "4h 14m 39s", "remaining_time": "7h 22m 9s", "loss_scale": 1.0, "consumed_samples": 595200, "global_step/max_steps": "2325/6362"} +{"lm loss": 4.99124527, "grad_norm": 0.68660527, "learning_rate": 7.59e-05, "elapsed_time_per_iteration": 6.67254353, "memory(GiB)": 21.51, "elapsed_time": "4h 14m 45s", "remaining_time": "7h 22m 3s", "loss_scale": 1.0, "consumed_samples": 595456, "global_step/max_steps": "2326/6362"} +{"lm loss": 5.00141335, "grad_norm": 0.78870511, "learning_rate": 7.588e-05, "elapsed_time_per_iteration": 6.58047438, "memory(GiB)": 21.51, "elapsed_time": "4h 14m 52s", "remaining_time": "7h 21m 56s", "loss_scale": 1.0, "consumed_samples": 595712, "global_step/max_steps": "2327/6362"} +{"lm loss": 4.9932847, "grad_norm": 0.78248709, "learning_rate": 7.585e-05, "elapsed_time_per_iteration": 6.48501754, "memory(GiB)": 21.51, "elapsed_time": "4h 14m 58s", "remaining_time": "7h 21m 50s", "loss_scale": 1.0, "consumed_samples": 595968, "global_step/max_steps": "2328/6362"} +{"lm loss": 4.98408365, "grad_norm": 0.83788359, "learning_rate": 7.583e-05, "elapsed_time_per_iteration": 6.4653368, "memory(GiB)": 21.51, "elapsed_time": "4h 15m 5s", "remaining_time": "7h 21m 43s", "loss_scale": 1.0, "consumed_samples": 596224, "global_step/max_steps": "2329/6362"} +{"lm loss": 5.00923347, "grad_norm": 0.83622462, "learning_rate": 7.581e-05, "elapsed_time_per_iteration": 6.58070469, "memory(GiB)": 21.51, "elapsed_time": "4h 15m 11s", "remaining_time": "7h 21m 36s", "loss_scale": 1.0, "consumed_samples": 596480, "global_step/max_steps": "2330/6362"} +{"lm loss": 4.99512339, "grad_norm": 0.73260111, "learning_rate": 7.579e-05, "elapsed_time_per_iteration": 6.54437327, "memory(GiB)": 21.51, "elapsed_time": "4h 15m 18s", "remaining_time": "7h 21m 30s", "loss_scale": 1.0, "consumed_samples": 596736, "global_step/max_steps": "2331/6362"} +{"lm loss": 4.98697996, "grad_norm": 0.64990896, "learning_rate": 7.577e-05, "elapsed_time_per_iteration": 6.55706716, "memory(GiB)": 21.51, "elapsed_time": "4h 15m 25s", "remaining_time": "7h 21m 23s", "loss_scale": 1.0, "consumed_samples": 596992, "global_step/max_steps": "2332/6362"} +{"lm loss": 4.98614597, "grad_norm": 0.60779679, "learning_rate": 7.574e-05, "elapsed_time_per_iteration": 6.55585885, "memory(GiB)": 21.51, "elapsed_time": "4h 15m 31s", "remaining_time": "7h 21m 17s", "loss_scale": 1.0, "consumed_samples": 597248, "global_step/max_steps": "2333/6362"} +{"lm loss": 4.97729254, "grad_norm": 0.58784676, "learning_rate": 7.572e-05, "elapsed_time_per_iteration": 6.64105463, "memory(GiB)": 21.51, "elapsed_time": "4h 15m 38s", "remaining_time": "7h 21m 10s", "loss_scale": 1.0, "consumed_samples": 597504, "global_step/max_steps": "2334/6362"} +{"lm loss": 4.9790659, "grad_norm": 0.53145063, "learning_rate": 7.57e-05, "elapsed_time_per_iteration": 6.56383467, "memory(GiB)": 21.51, "elapsed_time": "4h 15m 44s", "remaining_time": "7h 21m 4s", "loss_scale": 1.0, "consumed_samples": 597760, "global_step/max_steps": "2335/6362"} +{"lm loss": 4.98326731, "grad_norm": 0.57028061, "learning_rate": 7.568e-05, "elapsed_time_per_iteration": 6.78751469, "memory(GiB)": 21.51, "elapsed_time": "4h 15m 51s", "remaining_time": "7h 20m 57s", "loss_scale": 1.0, "consumed_samples": 598016, "global_step/max_steps": "2336/6362"} +{"lm loss": 4.99221039, "grad_norm": 0.63316268, "learning_rate": 7.566e-05, "elapsed_time_per_iteration": 6.41234136, "memory(GiB)": 21.51, "elapsed_time": "4h 15m 58s", "remaining_time": "7h 20m 51s", "loss_scale": 1.0, "consumed_samples": 598272, "global_step/max_steps": "2337/6362"} +{"lm loss": 4.99755573, "grad_norm": 0.70262259, "learning_rate": 7.563e-05, "elapsed_time_per_iteration": 6.46775556, "memory(GiB)": 21.51, "elapsed_time": "4h 16m 4s", "remaining_time": "7h 20m 44s", "loss_scale": 1.0, "consumed_samples": 598528, "global_step/max_steps": "2338/6362"} +{"lm loss": 4.99850273, "grad_norm": 0.72189999, "learning_rate": 7.561e-05, "elapsed_time_per_iteration": 6.6059649, "memory(GiB)": 21.51, "elapsed_time": "4h 16m 11s", "remaining_time": "7h 20m 37s", "loss_scale": 1.0, "consumed_samples": 598784, "global_step/max_steps": "2339/6362"} +{"lm loss": 4.96492386, "grad_norm": 0.75412965, "learning_rate": 7.559e-05, "elapsed_time_per_iteration": 6.57555366, "memory(GiB)": 21.51, "elapsed_time": "4h 16m 17s", "remaining_time": "7h 20m 31s", "loss_scale": 1.0, "consumed_samples": 599040, "global_step/max_steps": "2340/6362"} +{"lm loss": 4.99525642, "grad_norm": 0.82049149, "learning_rate": 7.557e-05, "elapsed_time_per_iteration": 6.35716891, "memory(GiB)": 21.51, "elapsed_time": "4h 16m 24s", "remaining_time": "7h 20m 24s", "loss_scale": 1.0, "consumed_samples": 599296, "global_step/max_steps": "2341/6362"} +{"lm loss": 4.99264479, "grad_norm": 0.73554897, "learning_rate": 7.555e-05, "elapsed_time_per_iteration": 6.61275411, "memory(GiB)": 21.51, "elapsed_time": "4h 16m 30s", "remaining_time": "7h 20m 17s", "loss_scale": 1.0, "consumed_samples": 599552, "global_step/max_steps": "2342/6362"} +{"lm loss": 4.98316288, "grad_norm": 0.61500967, "learning_rate": 7.553e-05, "elapsed_time_per_iteration": 6.41624236, "memory(GiB)": 21.51, "elapsed_time": "4h 16m 37s", "remaining_time": "7h 20m 10s", "loss_scale": 1.0, "consumed_samples": 599808, "global_step/max_steps": "2343/6362"} +{"lm loss": 4.970963, "grad_norm": 0.56497669, "learning_rate": 7.55e-05, "elapsed_time_per_iteration": 6.58524966, "memory(GiB)": 21.51, "elapsed_time": "4h 16m 43s", "remaining_time": "7h 20m 4s", "loss_scale": 1.0, "consumed_samples": 600064, "global_step/max_steps": "2344/6362"} +{"lm loss": 5.01791668, "grad_norm": 0.59885418, "learning_rate": 7.548e-05, "elapsed_time_per_iteration": 6.70186782, "memory(GiB)": 21.51, "elapsed_time": "4h 16m 50s", "remaining_time": "7h 19m 58s", "loss_scale": 1.0, "consumed_samples": 600320, "global_step/max_steps": "2345/6362"} +{"lm loss": 4.99827385, "grad_norm": 0.55452865, "learning_rate": 7.546e-05, "elapsed_time_per_iteration": 6.42823696, "memory(GiB)": 21.51, "elapsed_time": "4h 16m 56s", "remaining_time": "7h 19m 51s", "loss_scale": 1.0, "consumed_samples": 600576, "global_step/max_steps": "2346/6362"} +{"lm loss": 4.95162106, "grad_norm": 0.57839584, "learning_rate": 7.544e-05, "elapsed_time_per_iteration": 6.57748914, "memory(GiB)": 21.51, "elapsed_time": "4h 17m 3s", "remaining_time": "7h 19m 44s", "loss_scale": 1.0, "consumed_samples": 600832, "global_step/max_steps": "2347/6362"} +{"lm loss": 5.01509476, "grad_norm": 0.62232739, "learning_rate": 7.542e-05, "elapsed_time_per_iteration": 6.5047667, "memory(GiB)": 21.51, "elapsed_time": "4h 17m 9s", "remaining_time": "7h 19m 37s", "loss_scale": 1.0, "consumed_samples": 601088, "global_step/max_steps": "2348/6362"} +{"lm loss": 5.00412083, "grad_norm": 0.59928995, "learning_rate": 7.539e-05, "elapsed_time_per_iteration": 6.578372, "memory(GiB)": 21.51, "elapsed_time": "4h 17m 16s", "remaining_time": "7h 19m 31s", "loss_scale": 1.0, "consumed_samples": 601344, "global_step/max_steps": "2349/6362"} +{"lm loss": 4.98706579, "grad_norm": 0.60761863, "learning_rate": 7.537e-05, "elapsed_time_per_iteration": 6.48964787, "memory(GiB)": 21.51, "elapsed_time": "4h 17m 22s", "remaining_time": "7h 19m 24s", "loss_scale": 1.0, "consumed_samples": 601600, "global_step/max_steps": "2350/6362"} +{"lm loss": 4.98412848, "grad_norm": 0.53641576, "learning_rate": 7.535e-05, "elapsed_time_per_iteration": 6.69176888, "memory(GiB)": 21.51, "elapsed_time": "4h 17m 29s", "remaining_time": "7h 19m 18s", "loss_scale": 1.0, "consumed_samples": 601856, "global_step/max_steps": "2351/6362"} +{"lm loss": 4.99776554, "grad_norm": 0.59341335, "learning_rate": 7.533e-05, "elapsed_time_per_iteration": 6.61677957, "memory(GiB)": 21.51, "elapsed_time": "4h 17m 36s", "remaining_time": "7h 19m 11s", "loss_scale": 1.0, "consumed_samples": 602112, "global_step/max_steps": "2352/6362"} +{"lm loss": 4.96674538, "grad_norm": 0.56193829, "learning_rate": 7.531e-05, "elapsed_time_per_iteration": 6.63613081, "memory(GiB)": 21.51, "elapsed_time": "4h 17m 42s", "remaining_time": "7h 19m 5s", "loss_scale": 1.0, "consumed_samples": 602368, "global_step/max_steps": "2353/6362"} +{"lm loss": 4.98342657, "grad_norm": 0.66200531, "learning_rate": 7.528e-05, "elapsed_time_per_iteration": 6.51953936, "memory(GiB)": 21.51, "elapsed_time": "4h 17m 49s", "remaining_time": "7h 18m 58s", "loss_scale": 1.0, "consumed_samples": 602624, "global_step/max_steps": "2354/6362"} +{"lm loss": 4.98382616, "grad_norm": 0.72912139, "learning_rate": 7.526e-05, "elapsed_time_per_iteration": 6.47928667, "memory(GiB)": 21.51, "elapsed_time": "4h 17m 55s", "remaining_time": "7h 18m 51s", "loss_scale": 1.0, "consumed_samples": 602880, "global_step/max_steps": "2355/6362"} +{"lm loss": 5.0077014, "grad_norm": 0.74574989, "learning_rate": 7.524e-05, "elapsed_time_per_iteration": 6.63385248, "memory(GiB)": 21.51, "elapsed_time": "4h 18m 2s", "remaining_time": "7h 18m 45s", "loss_scale": 1.0, "consumed_samples": 603136, "global_step/max_steps": "2356/6362"} +{"lm loss": 5.00090885, "grad_norm": 0.73165792, "learning_rate": 7.522e-05, "elapsed_time_per_iteration": 6.44998527, "memory(GiB)": 21.51, "elapsed_time": "4h 18m 8s", "remaining_time": "7h 18m 38s", "loss_scale": 1.0, "consumed_samples": 603392, "global_step/max_steps": "2357/6362"} +{"lm loss": 4.99398661, "grad_norm": 0.78618902, "learning_rate": 7.52e-05, "elapsed_time_per_iteration": 6.53639102, "memory(GiB)": 21.51, "elapsed_time": "4h 18m 15s", "remaining_time": "7h 18m 32s", "loss_scale": 1.0, "consumed_samples": 603648, "global_step/max_steps": "2358/6362"} +{"lm loss": 4.98604536, "grad_norm": 0.74824643, "learning_rate": 7.517e-05, "elapsed_time_per_iteration": 6.57979155, "memory(GiB)": 21.51, "elapsed_time": "4h 18m 22s", "remaining_time": "7h 18m 25s", "loss_scale": 1.0, "consumed_samples": 603904, "global_step/max_steps": "2359/6362"} +{"lm loss": 4.98932648, "grad_norm": 0.69170457, "learning_rate": 7.515e-05, "elapsed_time_per_iteration": 6.71945548, "memory(GiB)": 21.51, "elapsed_time": "4h 18m 28s", "remaining_time": "7h 18m 19s", "loss_scale": 1.0, "consumed_samples": 604160, "global_step/max_steps": "2360/6362"} +{"lm loss": 4.98955297, "grad_norm": 0.69165123, "learning_rate": 7.513e-05, "elapsed_time_per_iteration": 6.60732079, "memory(GiB)": 21.51, "elapsed_time": "4h 18m 35s", "remaining_time": "7h 18m 12s", "loss_scale": 1.0, "consumed_samples": 604416, "global_step/max_steps": "2361/6362"} +{"lm loss": 5.01887226, "grad_norm": 0.61772907, "learning_rate": 7.511e-05, "elapsed_time_per_iteration": 6.61678505, "memory(GiB)": 21.51, "elapsed_time": "4h 18m 42s", "remaining_time": "7h 18m 6s", "loss_scale": 1.0, "consumed_samples": 604672, "global_step/max_steps": "2362/6362"} +{"lm loss": 4.99733686, "grad_norm": 0.68349624, "learning_rate": 7.509e-05, "elapsed_time_per_iteration": 6.88904786, "memory(GiB)": 21.51, "elapsed_time": "4h 18m 48s", "remaining_time": "7h 18m 0s", "loss_scale": 1.0, "consumed_samples": 604928, "global_step/max_steps": "2363/6362"} +{"lm loss": 4.99698687, "grad_norm": 0.61372298, "learning_rate": 7.506e-05, "elapsed_time_per_iteration": 6.82477975, "memory(GiB)": 21.51, "elapsed_time": "4h 18m 55s", "remaining_time": "7h 17m 54s", "loss_scale": 1.0, "consumed_samples": 605184, "global_step/max_steps": "2364/6362"} +{"lm loss": 4.97929859, "grad_norm": 0.60433745, "learning_rate": 7.504e-05, "elapsed_time_per_iteration": 6.52620816, "memory(GiB)": 21.51, "elapsed_time": "4h 19m 2s", "remaining_time": "7h 17m 47s", "loss_scale": 1.0, "consumed_samples": 605440, "global_step/max_steps": "2365/6362"} +{"lm loss": 4.9724102, "grad_norm": 0.57842225, "learning_rate": 7.502e-05, "elapsed_time_per_iteration": 6.7898438, "memory(GiB)": 21.51, "elapsed_time": "4h 19m 9s", "remaining_time": "7h 17m 41s", "loss_scale": 1.0, "consumed_samples": 605696, "global_step/max_steps": "2366/6362"} +{"lm loss": 4.97380686, "grad_norm": 0.55214381, "learning_rate": 7.5e-05, "elapsed_time_per_iteration": 6.76973796, "memory(GiB)": 21.51, "elapsed_time": "4h 19m 15s", "remaining_time": "7h 17m 34s", "loss_scale": 1.0, "consumed_samples": 605952, "global_step/max_steps": "2367/6362"} +{"lm loss": 5.01697731, "grad_norm": 0.61579579, "learning_rate": 7.498e-05, "elapsed_time_per_iteration": 6.68506575, "memory(GiB)": 21.51, "elapsed_time": "4h 19m 22s", "remaining_time": "7h 17m 28s", "loss_scale": 1.0, "consumed_samples": 606208, "global_step/max_steps": "2368/6362"} +{"lm loss": 4.96860552, "grad_norm": 0.62625152, "learning_rate": 7.495e-05, "elapsed_time_per_iteration": 6.61017179, "memory(GiB)": 21.51, "elapsed_time": "4h 19m 29s", "remaining_time": "7h 17m 22s", "loss_scale": 1.0, "consumed_samples": 606464, "global_step/max_steps": "2369/6362"} +{"lm loss": 4.9708147, "grad_norm": 0.5916093, "learning_rate": 7.493e-05, "elapsed_time_per_iteration": 6.72574639, "memory(GiB)": 21.51, "elapsed_time": "4h 19m 35s", "remaining_time": "7h 17m 15s", "loss_scale": 1.0, "consumed_samples": 606720, "global_step/max_steps": "2370/6362"} +{"lm loss": 4.97264957, "grad_norm": 0.52877754, "learning_rate": 7.491e-05, "elapsed_time_per_iteration": 6.65715146, "memory(GiB)": 21.51, "elapsed_time": "4h 19m 42s", "remaining_time": "7h 17m 9s", "loss_scale": 1.0, "consumed_samples": 606976, "global_step/max_steps": "2371/6362"} +{"lm loss": 5.00091743, "grad_norm": 0.58635563, "learning_rate": 7.489e-05, "elapsed_time_per_iteration": 6.57063985, "memory(GiB)": 21.51, "elapsed_time": "4h 19m 49s", "remaining_time": "7h 17m 2s", "loss_scale": 1.0, "consumed_samples": 607232, "global_step/max_steps": "2372/6362"} +{"lm loss": 4.99679756, "grad_norm": 0.66606843, "learning_rate": 7.487e-05, "elapsed_time_per_iteration": 6.57843351, "memory(GiB)": 21.51, "elapsed_time": "4h 19m 55s", "remaining_time": "7h 16m 56s", "loss_scale": 1.0, "consumed_samples": 607488, "global_step/max_steps": "2373/6362"} +{"lm loss": 4.99920702, "grad_norm": 0.60650486, "learning_rate": 7.484e-05, "elapsed_time_per_iteration": 6.52862334, "memory(GiB)": 21.51, "elapsed_time": "4h 20m 2s", "remaining_time": "7h 16m 49s", "loss_scale": 1.0, "consumed_samples": 607744, "global_step/max_steps": "2374/6362"} +{"lm loss": 4.98732328, "grad_norm": 0.53913367, "learning_rate": 7.482e-05, "elapsed_time_per_iteration": 6.73734379, "memory(GiB)": 21.51, "elapsed_time": "4h 20m 8s", "remaining_time": "7h 16m 43s", "loss_scale": 1.0, "consumed_samples": 608000, "global_step/max_steps": "2375/6362"} +{"lm loss": 4.97085094, "grad_norm": 0.6722759, "learning_rate": 7.48e-05, "elapsed_time_per_iteration": 6.56562304, "memory(GiB)": 21.51, "elapsed_time": "4h 20m 15s", "remaining_time": "7h 16m 36s", "loss_scale": 1.0, "consumed_samples": 608256, "global_step/max_steps": "2376/6362"} +{"lm loss": 5.00325203, "grad_norm": 0.76811111, "learning_rate": 7.478e-05, "elapsed_time_per_iteration": 6.54633641, "memory(GiB)": 21.51, "elapsed_time": "4h 20m 22s", "remaining_time": "7h 16m 30s", "loss_scale": 1.0, "consumed_samples": 608512, "global_step/max_steps": "2377/6362"} +{"lm loss": 4.97613478, "grad_norm": 0.7944808, "learning_rate": 7.476e-05, "elapsed_time_per_iteration": 6.4795084, "memory(GiB)": 21.51, "elapsed_time": "4h 20m 28s", "remaining_time": "7h 16m 23s", "loss_scale": 1.0, "consumed_samples": 608768, "global_step/max_steps": "2378/6362"} +{"lm loss": 4.99817991, "grad_norm": 0.91292679, "learning_rate": 7.473e-05, "elapsed_time_per_iteration": 6.74009657, "memory(GiB)": 21.51, "elapsed_time": "4h 20m 35s", "remaining_time": "7h 16m 17s", "loss_scale": 1.0, "consumed_samples": 609024, "global_step/max_steps": "2379/6362"} +{"lm loss": 4.99497795, "grad_norm": 0.8841334, "learning_rate": 7.471e-05, "elapsed_time_per_iteration": 6.62155724, "memory(GiB)": 21.51, "elapsed_time": "4h 20m 41s", "remaining_time": "7h 16m 10s", "loss_scale": 1.0, "consumed_samples": 609280, "global_step/max_steps": "2380/6362"} +{"lm loss": 5.0017643, "grad_norm": 0.63220274, "learning_rate": 7.469e-05, "elapsed_time_per_iteration": 6.69885302, "memory(GiB)": 21.51, "elapsed_time": "4h 20m 48s", "remaining_time": "7h 16m 4s", "loss_scale": 1.0, "consumed_samples": 609536, "global_step/max_steps": "2381/6362"} +{"lm loss": 4.96321964, "grad_norm": 0.63514692, "learning_rate": 7.467e-05, "elapsed_time_per_iteration": 6.7460537, "memory(GiB)": 21.51, "elapsed_time": "4h 20m 55s", "remaining_time": "7h 15m 57s", "loss_scale": 1.0, "consumed_samples": 609792, "global_step/max_steps": "2382/6362"} +{"lm loss": 4.98600721, "grad_norm": 0.78162283, "learning_rate": 7.464e-05, "elapsed_time_per_iteration": 6.66793752, "memory(GiB)": 21.51, "elapsed_time": "4h 21m 1s", "remaining_time": "7h 15m 51s", "loss_scale": 1.0, "consumed_samples": 610048, "global_step/max_steps": "2383/6362"} +{"lm loss": 5.01589537, "grad_norm": 0.67898989, "learning_rate": 7.462e-05, "elapsed_time_per_iteration": 6.62733793, "memory(GiB)": 21.51, "elapsed_time": "4h 21m 8s", "remaining_time": "7h 15m 44s", "loss_scale": 1.0, "consumed_samples": 610304, "global_step/max_steps": "2384/6362"} +{"lm loss": 4.99530602, "grad_norm": 0.67370462, "learning_rate": 7.46e-05, "elapsed_time_per_iteration": 6.46984935, "memory(GiB)": 21.51, "elapsed_time": "4h 21m 15s", "remaining_time": "7h 15m 38s", "loss_scale": 1.0, "consumed_samples": 610560, "global_step/max_steps": "2385/6362"} +{"lm loss": 4.97951841, "grad_norm": 0.63249898, "learning_rate": 7.458e-05, "elapsed_time_per_iteration": 6.47807002, "memory(GiB)": 21.51, "elapsed_time": "4h 21m 21s", "remaining_time": "7h 15m 31s", "loss_scale": 1.0, "consumed_samples": 610816, "global_step/max_steps": "2386/6362"} +{"lm loss": 4.9686799, "grad_norm": 0.53166866, "learning_rate": 7.456e-05, "elapsed_time_per_iteration": 6.51063752, "memory(GiB)": 21.51, "elapsed_time": "4h 21m 28s", "remaining_time": "7h 15m 24s", "loss_scale": 1.0, "consumed_samples": 611072, "global_step/max_steps": "2387/6362"} +{"lm loss": 4.97194242, "grad_norm": 0.62531453, "learning_rate": 7.453e-05, "elapsed_time_per_iteration": 6.58879685, "memory(GiB)": 21.51, "elapsed_time": "4h 21m 34s", "remaining_time": "7h 15m 18s", "loss_scale": 1.0, "consumed_samples": 611328, "global_step/max_steps": "2388/6362"} +{"lm loss": 4.97817039, "grad_norm": 0.54334182, "learning_rate": 7.451e-05, "elapsed_time_per_iteration": 6.60305834, "memory(GiB)": 21.51, "elapsed_time": "4h 21m 41s", "remaining_time": "7h 15m 11s", "loss_scale": 1.0, "consumed_samples": 611584, "global_step/max_steps": "2389/6362"} +{"lm loss": 4.99005556, "grad_norm": 0.48979971, "learning_rate": 7.449e-05, "elapsed_time_per_iteration": 6.56557322, "memory(GiB)": 21.51, "elapsed_time": "4h 21m 47s", "remaining_time": "7h 15m 5s", "loss_scale": 1.0, "consumed_samples": 611840, "global_step/max_steps": "2390/6362"} +{"lm loss": 4.9736743, "grad_norm": 0.5468033, "learning_rate": 7.447e-05, "elapsed_time_per_iteration": 6.59767509, "memory(GiB)": 21.51, "elapsed_time": "4h 21m 54s", "remaining_time": "7h 14m 58s", "loss_scale": 1.0, "consumed_samples": 612096, "global_step/max_steps": "2391/6362"} +{"lm loss": 4.97645712, "grad_norm": 0.53055966, "learning_rate": 7.444e-05, "elapsed_time_per_iteration": 6.40613627, "memory(GiB)": 21.51, "elapsed_time": "4h 22m 0s", "remaining_time": "7h 14m 51s", "loss_scale": 1.0, "consumed_samples": 612352, "global_step/max_steps": "2392/6362"} +{"lm loss": 4.99088049, "grad_norm": 0.55651486, "learning_rate": 7.442e-05, "elapsed_time_per_iteration": 6.81723452, "memory(GiB)": 21.51, "elapsed_time": "4h 22m 7s", "remaining_time": "7h 14m 45s", "loss_scale": 1.0, "consumed_samples": 612608, "global_step/max_steps": "2393/6362"} +{"lm loss": 4.96765375, "grad_norm": 0.6808688, "learning_rate": 7.44e-05, "elapsed_time_per_iteration": 6.67533875, "memory(GiB)": 21.51, "elapsed_time": "4h 22m 14s", "remaining_time": "7h 14m 39s", "loss_scale": 1.0, "consumed_samples": 612864, "global_step/max_steps": "2394/6362"} +{"lm loss": 4.96049023, "grad_norm": 0.67156655, "learning_rate": 7.438e-05, "elapsed_time_per_iteration": 6.50067282, "memory(GiB)": 21.51, "elapsed_time": "4h 22m 20s", "remaining_time": "7h 14m 32s", "loss_scale": 1.0, "consumed_samples": 613120, "global_step/max_steps": "2395/6362"} +{"lm loss": 4.98364782, "grad_norm": 0.67583472, "learning_rate": 7.436e-05, "elapsed_time_per_iteration": 6.47994399, "memory(GiB)": 21.51, "elapsed_time": "4h 22m 27s", "remaining_time": "7h 14m 25s", "loss_scale": 1.0, "consumed_samples": 613376, "global_step/max_steps": "2396/6362"} +{"lm loss": 4.97504759, "grad_norm": 0.61852646, "learning_rate": 7.433e-05, "elapsed_time_per_iteration": 6.59248638, "memory(GiB)": 21.51, "elapsed_time": "4h 22m 33s", "remaining_time": "7h 14m 19s", "loss_scale": 1.0, "consumed_samples": 613632, "global_step/max_steps": "2397/6362"} +{"lm loss": 4.96415615, "grad_norm": 0.60029691, "learning_rate": 7.431e-05, "elapsed_time_per_iteration": 6.6332829, "memory(GiB)": 21.51, "elapsed_time": "4h 22m 40s", "remaining_time": "7h 14m 12s", "loss_scale": 1.0, "consumed_samples": 613888, "global_step/max_steps": "2398/6362"} +{"lm loss": 4.98123789, "grad_norm": 0.52881205, "learning_rate": 7.429e-05, "elapsed_time_per_iteration": 6.55978441, "memory(GiB)": 21.51, "elapsed_time": "4h 22m 47s", "remaining_time": "7h 14m 6s", "loss_scale": 1.0, "consumed_samples": 614144, "global_step/max_steps": "2399/6362"} +{"lm loss": 4.99997425, "grad_norm": 0.54378843, "learning_rate": 7.427e-05, "elapsed_time_per_iteration": 6.62029195, "memory(GiB)": 21.51, "elapsed_time": "4h 22m 53s", "remaining_time": "7h 13m 59s", "loss_scale": 1.0, "consumed_samples": 614400, "global_step/max_steps": "2400/6362"} +{"lm loss": 5.00067616, "grad_norm": 0.68338728, "learning_rate": 7.424e-05, "elapsed_time_per_iteration": 6.74718428, "memory(GiB)": 21.51, "elapsed_time": "4h 23m 0s", "remaining_time": "7h 13m 53s", "loss_scale": 1.0, "consumed_samples": 614656, "global_step/max_steps": "2401/6362"} +{"lm loss": 5.00959969, "grad_norm": 0.68134499, "learning_rate": 7.422e-05, "elapsed_time_per_iteration": 6.62508464, "memory(GiB)": 21.51, "elapsed_time": "4h 23m 7s", "remaining_time": "7h 13m 46s", "loss_scale": 1.0, "consumed_samples": 614912, "global_step/max_steps": "2402/6362"} +{"lm loss": 4.9855504, "grad_norm": 0.65048689, "learning_rate": 7.42e-05, "elapsed_time_per_iteration": 6.48101115, "memory(GiB)": 21.51, "elapsed_time": "4h 23m 13s", "remaining_time": "7h 13m 40s", "loss_scale": 1.0, "consumed_samples": 615168, "global_step/max_steps": "2403/6362"} +{"lm loss": 4.98685074, "grad_norm": 0.6235202, "learning_rate": 7.418e-05, "elapsed_time_per_iteration": 6.46681643, "memory(GiB)": 21.51, "elapsed_time": "4h 23m 20s", "remaining_time": "7h 13m 33s", "loss_scale": 1.0, "consumed_samples": 615424, "global_step/max_steps": "2404/6362"} +{"lm loss": 4.98839378, "grad_norm": 0.63378835, "learning_rate": 7.416e-05, "elapsed_time_per_iteration": 6.64896703, "memory(GiB)": 21.51, "elapsed_time": "4h 23m 26s", "remaining_time": "7h 13m 27s", "loss_scale": 1.0, "consumed_samples": 615680, "global_step/max_steps": "2405/6362"} +{"lm loss": 4.99436617, "grad_norm": 0.68795329, "learning_rate": 7.413e-05, "elapsed_time_per_iteration": 6.57322049, "memory(GiB)": 21.51, "elapsed_time": "4h 23m 33s", "remaining_time": "7h 13m 20s", "loss_scale": 1.0, "consumed_samples": 615936, "global_step/max_steps": "2406/6362"} +{"lm loss": 4.98748159, "grad_norm": 0.75562203, "learning_rate": 7.411e-05, "elapsed_time_per_iteration": 6.59865594, "memory(GiB)": 21.51, "elapsed_time": "4h 23m 39s", "remaining_time": "7h 13m 13s", "loss_scale": 1.0, "consumed_samples": 616192, "global_step/max_steps": "2407/6362"} +{"lm loss": 5.00441837, "grad_norm": 0.76735252, "learning_rate": 7.409e-05, "elapsed_time_per_iteration": 6.51388597, "memory(GiB)": 21.51, "elapsed_time": "4h 23m 46s", "remaining_time": "7h 13m 7s", "loss_scale": 1.0, "consumed_samples": 616448, "global_step/max_steps": "2408/6362"} +{"lm loss": 4.98270035, "grad_norm": 0.66995776, "learning_rate": 7.407e-05, "elapsed_time_per_iteration": 6.53113484, "memory(GiB)": 21.51, "elapsed_time": "4h 23m 52s", "remaining_time": "7h 13m 0s", "loss_scale": 1.0, "consumed_samples": 616704, "global_step/max_steps": "2409/6362"} +{"lm loss": 4.97381544, "grad_norm": 0.62685287, "learning_rate": 7.404e-05, "elapsed_time_per_iteration": 6.53637552, "memory(GiB)": 21.51, "elapsed_time": "4h 23m 59s", "remaining_time": "7h 12m 54s", "loss_scale": 1.0, "consumed_samples": 616960, "global_step/max_steps": "2410/6362"} +{"lm loss": 4.99074411, "grad_norm": 0.6691851, "learning_rate": 7.402e-05, "elapsed_time_per_iteration": 6.60724354, "memory(GiB)": 21.51, "elapsed_time": "4h 24m 6s", "remaining_time": "7h 12m 47s", "loss_scale": 1.0, "consumed_samples": 617216, "global_step/max_steps": "2411/6362"} +{"lm loss": 4.98935461, "grad_norm": 0.65778768, "learning_rate": 7.4e-05, "elapsed_time_per_iteration": 6.60614657, "memory(GiB)": 21.51, "elapsed_time": "4h 24m 12s", "remaining_time": "7h 12m 40s", "loss_scale": 1.0, "consumed_samples": 617472, "global_step/max_steps": "2412/6362"} +{"lm loss": 4.97217798, "grad_norm": 0.62677079, "learning_rate": 7.398e-05, "elapsed_time_per_iteration": 6.6267271, "memory(GiB)": 21.51, "elapsed_time": "4h 24m 19s", "remaining_time": "7h 12m 34s", "loss_scale": 1.0, "consumed_samples": 617728, "global_step/max_steps": "2413/6362"} +{"lm loss": 4.98457527, "grad_norm": 0.59842604, "learning_rate": 7.395e-05, "elapsed_time_per_iteration": 6.74835014, "memory(GiB)": 21.51, "elapsed_time": "4h 24m 25s", "remaining_time": "7h 12m 28s", "loss_scale": 1.0, "consumed_samples": 617984, "global_step/max_steps": "2414/6362"} +{"lm loss": 4.97497797, "grad_norm": 0.62760943, "learning_rate": 7.393e-05, "elapsed_time_per_iteration": 6.7777288, "memory(GiB)": 21.51, "elapsed_time": "4h 24m 32s", "remaining_time": "7h 12m 21s", "loss_scale": 1.0, "consumed_samples": 618240, "global_step/max_steps": "2415/6362"} +{"lm loss": 5.00485563, "grad_norm": 0.61455697, "learning_rate": 7.391e-05, "elapsed_time_per_iteration": 6.65681958, "memory(GiB)": 21.51, "elapsed_time": "4h 24m 39s", "remaining_time": "7h 12m 15s", "loss_scale": 1.0, "consumed_samples": 618496, "global_step/max_steps": "2416/6362"} +{"lm loss": 5.0024004, "grad_norm": 0.6288963, "learning_rate": 7.389e-05, "elapsed_time_per_iteration": 6.59383035, "memory(GiB)": 21.51, "elapsed_time": "4h 24m 46s", "remaining_time": "7h 12m 8s", "loss_scale": 1.0, "consumed_samples": 618752, "global_step/max_steps": "2417/6362"} +{"lm loss": 4.98034906, "grad_norm": 0.63898921, "learning_rate": 7.387e-05, "elapsed_time_per_iteration": 6.59871769, "memory(GiB)": 21.51, "elapsed_time": "4h 24m 52s", "remaining_time": "7h 12m 2s", "loss_scale": 1.0, "consumed_samples": 619008, "global_step/max_steps": "2418/6362"} +{"lm loss": 4.97401762, "grad_norm": 0.67429465, "learning_rate": 7.384e-05, "elapsed_time_per_iteration": 6.45171189, "memory(GiB)": 21.51, "elapsed_time": "4h 24m 59s", "remaining_time": "7h 11m 55s", "loss_scale": 1.0, "consumed_samples": 619264, "global_step/max_steps": "2419/6362"} +{"lm loss": 4.95097208, "grad_norm": 0.62358117, "learning_rate": 7.382e-05, "elapsed_time_per_iteration": 6.65499687, "memory(GiB)": 21.51, "elapsed_time": "4h 25m 5s", "remaining_time": "7h 11m 49s", "loss_scale": 1.0, "consumed_samples": 619520, "global_step/max_steps": "2420/6362"} +{"lm loss": 4.9903698, "grad_norm": 0.53425241, "learning_rate": 7.38e-05, "elapsed_time_per_iteration": 6.79724121, "memory(GiB)": 21.51, "elapsed_time": "4h 25m 12s", "remaining_time": "7h 11m 43s", "loss_scale": 1.0, "consumed_samples": 619776, "global_step/max_steps": "2421/6362"} +{"lm loss": 5.00248671, "grad_norm": 0.61684906, "learning_rate": 7.378e-05, "elapsed_time_per_iteration": 6.58848023, "memory(GiB)": 21.51, "elapsed_time": "4h 25m 19s", "remaining_time": "7h 11m 36s", "loss_scale": 1.0, "consumed_samples": 620032, "global_step/max_steps": "2422/6362"} +{"lm loss": 4.97339535, "grad_norm": 0.5774101, "learning_rate": 7.375e-05, "elapsed_time_per_iteration": 6.3165369, "memory(GiB)": 21.51, "elapsed_time": "4h 25m 25s", "remaining_time": "7h 11m 29s", "loss_scale": 1.0, "consumed_samples": 620288, "global_step/max_steps": "2423/6362"} +{"lm loss": 4.97985697, "grad_norm": 0.56622511, "learning_rate": 7.373e-05, "elapsed_time_per_iteration": 6.67478108, "memory(GiB)": 21.51, "elapsed_time": "4h 25m 32s", "remaining_time": "7h 11m 23s", "loss_scale": 1.0, "consumed_samples": 620544, "global_step/max_steps": "2424/6362"} +{"lm loss": 4.98567152, "grad_norm": 0.51157337, "learning_rate": 7.371e-05, "elapsed_time_per_iteration": 6.60682011, "memory(GiB)": 21.51, "elapsed_time": "4h 25m 38s", "remaining_time": "7h 11m 16s", "loss_scale": 1.0, "consumed_samples": 620800, "global_step/max_steps": "2425/6362"} +{"lm loss": 4.97465086, "grad_norm": 0.54320371, "learning_rate": 7.369e-05, "elapsed_time_per_iteration": 6.55431604, "memory(GiB)": 21.51, "elapsed_time": "4h 25m 45s", "remaining_time": "7h 11m 9s", "loss_scale": 1.0, "consumed_samples": 621056, "global_step/max_steps": "2426/6362"} +{"lm loss": 4.99932194, "grad_norm": 0.60039926, "learning_rate": 7.366e-05, "elapsed_time_per_iteration": 6.51048374, "memory(GiB)": 21.51, "elapsed_time": "4h 25m 51s", "remaining_time": "7h 11m 3s", "loss_scale": 1.0, "consumed_samples": 621312, "global_step/max_steps": "2427/6362"} +{"lm loss": 4.97440863, "grad_norm": 0.59056383, "learning_rate": 7.364e-05, "elapsed_time_per_iteration": 6.64036345, "memory(GiB)": 21.51, "elapsed_time": "4h 25m 58s", "remaining_time": "7h 10m 56s", "loss_scale": 1.0, "consumed_samples": 621568, "global_step/max_steps": "2428/6362"} +{"lm loss": 4.99506855, "grad_norm": 0.63312954, "learning_rate": 7.362e-05, "elapsed_time_per_iteration": 6.5182178, "memory(GiB)": 21.51, "elapsed_time": "4h 26m 4s", "remaining_time": "7h 10m 50s", "loss_scale": 1.0, "consumed_samples": 621824, "global_step/max_steps": "2429/6362"} +{"lm loss": 4.98950815, "grad_norm": 0.75180662, "learning_rate": 7.36e-05, "elapsed_time_per_iteration": 6.41320491, "memory(GiB)": 21.51, "elapsed_time": "4h 26m 11s", "remaining_time": "7h 10m 43s", "loss_scale": 1.0, "consumed_samples": 622080, "global_step/max_steps": "2430/6362"} +{"lm loss": 4.99710703, "grad_norm": 0.69677687, "learning_rate": 7.357e-05, "elapsed_time_per_iteration": 6.2794311, "memory(GiB)": 21.51, "elapsed_time": "4h 26m 17s", "remaining_time": "7h 10m 36s", "loss_scale": 1.0, "consumed_samples": 622336, "global_step/max_steps": "2431/6362"} +{"lm loss": 4.9678216, "grad_norm": 0.57600868, "learning_rate": 7.355e-05, "elapsed_time_per_iteration": 6.54291224, "memory(GiB)": 21.51, "elapsed_time": "4h 26m 24s", "remaining_time": "7h 10m 29s", "loss_scale": 1.0, "consumed_samples": 622592, "global_step/max_steps": "2432/6362"} +{"lm loss": 4.97027016, "grad_norm": 0.6184262, "learning_rate": 7.353e-05, "elapsed_time_per_iteration": 6.54417253, "memory(GiB)": 21.51, "elapsed_time": "4h 26m 30s", "remaining_time": "7h 10m 23s", "loss_scale": 1.0, "consumed_samples": 622848, "global_step/max_steps": "2433/6362"} +{"lm loss": 4.97200966, "grad_norm": 0.62500989, "learning_rate": 7.351e-05, "elapsed_time_per_iteration": 6.68487263, "memory(GiB)": 21.51, "elapsed_time": "4h 26m 37s", "remaining_time": "7h 10m 16s", "loss_scale": 1.0, "consumed_samples": 623104, "global_step/max_steps": "2434/6362"} +{"lm loss": 4.96611738, "grad_norm": 0.72975308, "learning_rate": 7.348e-05, "elapsed_time_per_iteration": 6.50947213, "memory(GiB)": 21.51, "elapsed_time": "4h 26m 43s", "remaining_time": "7h 10m 10s", "loss_scale": 1.0, "consumed_samples": 623360, "global_step/max_steps": "2435/6362"} +{"lm loss": 4.98981619, "grad_norm": 0.87233073, "learning_rate": 7.346e-05, "elapsed_time_per_iteration": 6.19889426, "memory(GiB)": 21.51, "elapsed_time": "4h 26m 50s", "remaining_time": "7h 10m 2s", "loss_scale": 1.0, "consumed_samples": 623616, "global_step/max_steps": "2436/6362"} +{"lm loss": 4.98698378, "grad_norm": 0.92430532, "learning_rate": 7.344e-05, "elapsed_time_per_iteration": 6.35013223, "memory(GiB)": 21.51, "elapsed_time": "4h 26m 56s", "remaining_time": "7h 9m 55s", "loss_scale": 1.0, "consumed_samples": 623872, "global_step/max_steps": "2437/6362"} +{"lm loss": 4.97930384, "grad_norm": 0.81858164, "learning_rate": 7.342e-05, "elapsed_time_per_iteration": 6.44593859, "memory(GiB)": 21.51, "elapsed_time": "4h 27m 2s", "remaining_time": "7h 9m 49s", "loss_scale": 1.0, "consumed_samples": 624128, "global_step/max_steps": "2438/6362"} +{"lm loss": 4.98327732, "grad_norm": 0.62799776, "learning_rate": 7.339e-05, "elapsed_time_per_iteration": 6.5120008, "memory(GiB)": 21.51, "elapsed_time": "4h 27m 9s", "remaining_time": "7h 9m 42s", "loss_scale": 1.0, "consumed_samples": 624384, "global_step/max_steps": "2439/6362"} +{"lm loss": 4.98555326, "grad_norm": 0.57516241, "learning_rate": 7.337e-05, "elapsed_time_per_iteration": 6.58858228, "memory(GiB)": 21.51, "elapsed_time": "4h 27m 16s", "remaining_time": "7h 9m 35s", "loss_scale": 1.0, "consumed_samples": 624640, "global_step/max_steps": "2440/6362"} +{"lm loss": 4.99412107, "grad_norm": 0.70103335, "learning_rate": 7.335e-05, "elapsed_time_per_iteration": 6.47602487, "memory(GiB)": 21.51, "elapsed_time": "4h 27m 22s", "remaining_time": "7h 9m 29s", "loss_scale": 1.0, "consumed_samples": 624896, "global_step/max_steps": "2441/6362"} +{"lm loss": 4.97449303, "grad_norm": 0.74127638, "learning_rate": 7.333e-05, "elapsed_time_per_iteration": 6.51932526, "memory(GiB)": 21.51, "elapsed_time": "4h 27m 29s", "remaining_time": "7h 9m 22s", "loss_scale": 1.0, "consumed_samples": 625152, "global_step/max_steps": "2442/6362"} +{"lm loss": 4.98844719, "grad_norm": 0.54816157, "learning_rate": 7.33e-05, "elapsed_time_per_iteration": 6.64385033, "memory(GiB)": 21.51, "elapsed_time": "4h 27m 35s", "remaining_time": "7h 9m 16s", "loss_scale": 1.0, "consumed_samples": 625408, "global_step/max_steps": "2443/6362"} +{"lm loss": 4.96164227, "grad_norm": 0.62641382, "learning_rate": 7.328e-05, "elapsed_time_per_iteration": 6.46368456, "memory(GiB)": 21.51, "elapsed_time": "4h 27m 42s", "remaining_time": "7h 9m 9s", "loss_scale": 1.0, "consumed_samples": 625664, "global_step/max_steps": "2444/6362"} +{"lm loss": 4.98892212, "grad_norm": 0.71051133, "learning_rate": 7.326e-05, "elapsed_time_per_iteration": 6.7437098, "memory(GiB)": 21.51, "elapsed_time": "4h 27m 48s", "remaining_time": "7h 9m 3s", "loss_scale": 1.0, "consumed_samples": 625920, "global_step/max_steps": "2445/6362"} +{"lm loss": 4.97344637, "grad_norm": 0.66659403, "learning_rate": 7.324e-05, "elapsed_time_per_iteration": 6.44726133, "memory(GiB)": 21.51, "elapsed_time": "4h 27m 55s", "remaining_time": "7h 8m 56s", "loss_scale": 1.0, "consumed_samples": 626176, "global_step/max_steps": "2446/6362"} +{"lm loss": 4.97695112, "grad_norm": 0.61632997, "learning_rate": 7.321e-05, "elapsed_time_per_iteration": 6.98405099, "memory(GiB)": 21.51, "elapsed_time": "4h 28m 2s", "remaining_time": "7h 8m 50s", "loss_scale": 1.0, "consumed_samples": 626432, "global_step/max_steps": "2447/6362"} +{"lm loss": 4.98294401, "grad_norm": 0.55414206, "learning_rate": 7.319e-05, "elapsed_time_per_iteration": 6.69093585, "memory(GiB)": 21.51, "elapsed_time": "4h 28m 8s", "remaining_time": "7h 8m 43s", "loss_scale": 1.0, "consumed_samples": 626688, "global_step/max_steps": "2448/6362"} +{"lm loss": 4.97887373, "grad_norm": 0.57833523, "learning_rate": 7.317e-05, "elapsed_time_per_iteration": 6.56314087, "memory(GiB)": 21.51, "elapsed_time": "4h 28m 15s", "remaining_time": "7h 8m 37s", "loss_scale": 1.0, "consumed_samples": 626944, "global_step/max_steps": "2449/6362"} +{"lm loss": 4.99047899, "grad_norm": 0.60591561, "learning_rate": 7.315e-05, "elapsed_time_per_iteration": 6.57341814, "memory(GiB)": 21.51, "elapsed_time": "4h 28m 22s", "remaining_time": "7h 8m 30s", "loss_scale": 1.0, "consumed_samples": 627200, "global_step/max_steps": "2450/6362"} +{"lm loss": 4.96289921, "grad_norm": 0.54219562, "learning_rate": 7.312e-05, "elapsed_time_per_iteration": 6.52870703, "memory(GiB)": 21.51, "elapsed_time": "4h 28m 28s", "remaining_time": "7h 8m 24s", "loss_scale": 1.0, "consumed_samples": 627456, "global_step/max_steps": "2451/6362"} +{"lm loss": 4.96649694, "grad_norm": 0.61182344, "learning_rate": 7.31e-05, "elapsed_time_per_iteration": 6.54346299, "memory(GiB)": 21.51, "elapsed_time": "4h 28m 35s", "remaining_time": "7h 8m 17s", "loss_scale": 1.0, "consumed_samples": 627712, "global_step/max_steps": "2452/6362"} +{"lm loss": 4.97054291, "grad_norm": 0.66438466, "learning_rate": 7.308e-05, "elapsed_time_per_iteration": 6.55791879, "memory(GiB)": 21.51, "elapsed_time": "4h 28m 41s", "remaining_time": "7h 8m 10s", "loss_scale": 1.0, "consumed_samples": 627968, "global_step/max_steps": "2453/6362"} +{"lm loss": 4.98479176, "grad_norm": 0.62807012, "learning_rate": 7.306e-05, "elapsed_time_per_iteration": 6.62072086, "memory(GiB)": 21.51, "elapsed_time": "4h 28m 48s", "remaining_time": "7h 8m 4s", "loss_scale": 1.0, "consumed_samples": 628224, "global_step/max_steps": "2454/6362"} +{"lm loss": 4.99266481, "grad_norm": 0.53653586, "learning_rate": 7.303e-05, "elapsed_time_per_iteration": 6.68040824, "memory(GiB)": 21.51, "elapsed_time": "4h 28m 55s", "remaining_time": "7h 7m 58s", "loss_scale": 1.0, "consumed_samples": 628480, "global_step/max_steps": "2455/6362"} +{"lm loss": 4.96864033, "grad_norm": 0.55102313, "learning_rate": 7.301e-05, "elapsed_time_per_iteration": 6.85492182, "memory(GiB)": 21.51, "elapsed_time": "4h 29m 1s", "remaining_time": "7h 7m 51s", "loss_scale": 1.0, "consumed_samples": 628736, "global_step/max_steps": "2456/6362"} +{"lm loss": 4.97830915, "grad_norm": 0.6213004, "learning_rate": 7.299e-05, "elapsed_time_per_iteration": 6.66226459, "memory(GiB)": 21.51, "elapsed_time": "4h 29m 8s", "remaining_time": "7h 7m 45s", "loss_scale": 1.0, "consumed_samples": 628992, "global_step/max_steps": "2457/6362"} +{"lm loss": 4.98053837, "grad_norm": 0.6172626, "learning_rate": 7.297e-05, "elapsed_time_per_iteration": 6.67506337, "memory(GiB)": 21.51, "elapsed_time": "4h 29m 15s", "remaining_time": "7h 7m 39s", "loss_scale": 1.0, "consumed_samples": 629248, "global_step/max_steps": "2458/6362"} +{"lm loss": 4.9823451, "grad_norm": 0.50743902, "learning_rate": 7.294e-05, "elapsed_time_per_iteration": 6.68874145, "memory(GiB)": 21.51, "elapsed_time": "4h 29m 21s", "remaining_time": "7h 7m 32s", "loss_scale": 1.0, "consumed_samples": 629504, "global_step/max_steps": "2459/6362"} +{"lm loss": 4.99113369, "grad_norm": 0.51667202, "learning_rate": 7.292e-05, "elapsed_time_per_iteration": 6.72812629, "memory(GiB)": 21.51, "elapsed_time": "4h 29m 28s", "remaining_time": "7h 7m 26s", "loss_scale": 1.0, "consumed_samples": 629760, "global_step/max_steps": "2460/6362"} +{"lm loss": 4.99650669, "grad_norm": 0.59441626, "learning_rate": 7.29e-05, "elapsed_time_per_iteration": 6.48877692, "memory(GiB)": 21.51, "elapsed_time": "4h 29m 35s", "remaining_time": "7h 7m 19s", "loss_scale": 1.0, "consumed_samples": 630016, "global_step/max_steps": "2461/6362"} +{"lm loss": 4.98832798, "grad_norm": 0.61812794, "learning_rate": 7.288e-05, "elapsed_time_per_iteration": 6.47547674, "memory(GiB)": 21.51, "elapsed_time": "4h 29m 41s", "remaining_time": "7h 7m 12s", "loss_scale": 1.0, "consumed_samples": 630272, "global_step/max_steps": "2462/6362"} +{"lm loss": 4.98882818, "grad_norm": 0.61975676, "learning_rate": 7.285e-05, "elapsed_time_per_iteration": 6.50945711, "memory(GiB)": 21.51, "elapsed_time": "4h 29m 48s", "remaining_time": "7h 7m 6s", "loss_scale": 1.0, "consumed_samples": 630528, "global_step/max_steps": "2463/6362"} +{"lm loss": 4.99424982, "grad_norm": 0.59615445, "learning_rate": 7.283e-05, "elapsed_time_per_iteration": 6.60701466, "memory(GiB)": 21.51, "elapsed_time": "4h 29m 54s", "remaining_time": "7h 6m 59s", "loss_scale": 1.0, "consumed_samples": 630784, "global_step/max_steps": "2464/6362"} +{"lm loss": 4.97371387, "grad_norm": 0.6115244, "learning_rate": 7.281e-05, "elapsed_time_per_iteration": 6.71882248, "memory(GiB)": 21.51, "elapsed_time": "4h 30m 1s", "remaining_time": "7h 6m 53s", "loss_scale": 1.0, "consumed_samples": 631040, "global_step/max_steps": "2465/6362"} +{"lm loss": 4.99076986, "grad_norm": 0.67522311, "learning_rate": 7.278e-05, "elapsed_time_per_iteration": 6.77187848, "memory(GiB)": 21.51, "elapsed_time": "4h 30m 8s", "remaining_time": "7h 6m 47s", "loss_scale": 1.0, "consumed_samples": 631296, "global_step/max_steps": "2466/6362"} +{"lm loss": 5.0062294, "grad_norm": 0.74665654, "learning_rate": 7.276e-05, "elapsed_time_per_iteration": 6.54647565, "memory(GiB)": 21.51, "elapsed_time": "4h 30m 14s", "remaining_time": "7h 6m 40s", "loss_scale": 1.0, "consumed_samples": 631552, "global_step/max_steps": "2467/6362"} +{"lm loss": 4.97291946, "grad_norm": 0.78048587, "learning_rate": 7.274e-05, "elapsed_time_per_iteration": 6.43574548, "memory(GiB)": 21.51, "elapsed_time": "4h 30m 21s", "remaining_time": "7h 6m 33s", "loss_scale": 1.0, "consumed_samples": 631808, "global_step/max_steps": "2468/6362"} +{"lm loss": 4.98056984, "grad_norm": 0.68445927, "learning_rate": 7.272e-05, "elapsed_time_per_iteration": 6.43148828, "memory(GiB)": 21.51, "elapsed_time": "4h 30m 27s", "remaining_time": "7h 6m 26s", "loss_scale": 1.0, "consumed_samples": 632064, "global_step/max_steps": "2469/6362"} +{"lm loss": 4.98954439, "grad_norm": 0.55122387, "learning_rate": 7.269e-05, "elapsed_time_per_iteration": 6.63316274, "memory(GiB)": 21.51, "elapsed_time": "4h 30m 34s", "remaining_time": "7h 6m 20s", "loss_scale": 1.0, "consumed_samples": 632320, "global_step/max_steps": "2470/6362"} +{"lm loss": 4.95957518, "grad_norm": 0.59475362, "learning_rate": 7.267e-05, "elapsed_time_per_iteration": 6.54732728, "memory(GiB)": 21.51, "elapsed_time": "4h 30m 40s", "remaining_time": "7h 6m 13s", "loss_scale": 1.0, "consumed_samples": 632576, "global_step/max_steps": "2471/6362"} +{"lm loss": 4.95978308, "grad_norm": 0.58748168, "learning_rate": 7.265e-05, "elapsed_time_per_iteration": 6.61699939, "memory(GiB)": 21.51, "elapsed_time": "4h 30m 47s", "remaining_time": "7h 6m 7s", "loss_scale": 1.0, "consumed_samples": 632832, "global_step/max_steps": "2472/6362"} +{"lm loss": 4.99161434, "grad_norm": 0.55659401, "learning_rate": 7.263e-05, "elapsed_time_per_iteration": 6.4830358, "memory(GiB)": 21.51, "elapsed_time": "4h 30m 53s", "remaining_time": "7h 6m 0s", "loss_scale": 1.0, "consumed_samples": 633088, "global_step/max_steps": "2473/6362"} +{"lm loss": 4.96929169, "grad_norm": 0.57620478, "learning_rate": 7.26e-05, "elapsed_time_per_iteration": 6.47480989, "memory(GiB)": 21.51, "elapsed_time": "4h 31m 0s", "remaining_time": "7h 5m 53s", "loss_scale": 1.0, "consumed_samples": 633344, "global_step/max_steps": "2474/6362"} +{"lm loss": 4.97571707, "grad_norm": 0.52938825, "learning_rate": 7.258e-05, "elapsed_time_per_iteration": 6.78256369, "memory(GiB)": 21.51, "elapsed_time": "4h 31m 7s", "remaining_time": "7h 5m 47s", "loss_scale": 1.0, "consumed_samples": 633600, "global_step/max_steps": "2475/6362"} +{"lm loss": 5.00154352, "grad_norm": 0.53449559, "learning_rate": 7.256e-05, "elapsed_time_per_iteration": 6.4659245, "memory(GiB)": 21.51, "elapsed_time": "4h 31m 13s", "remaining_time": "7h 5m 40s", "loss_scale": 1.0, "consumed_samples": 633856, "global_step/max_steps": "2476/6362"} +{"lm loss": 4.98099661, "grad_norm": 0.61991048, "learning_rate": 7.254e-05, "elapsed_time_per_iteration": 6.50728154, "memory(GiB)": 21.51, "elapsed_time": "4h 31m 20s", "remaining_time": "7h 5m 34s", "loss_scale": 1.0, "consumed_samples": 634112, "global_step/max_steps": "2477/6362"} +{"lm loss": 5.00573349, "grad_norm": 0.68090802, "learning_rate": 7.251e-05, "elapsed_time_per_iteration": 6.96005034, "memory(GiB)": 21.51, "elapsed_time": "4h 31m 27s", "remaining_time": "7h 5m 28s", "loss_scale": 1.0, "consumed_samples": 634368, "global_step/max_steps": "2478/6362"} +{"lm loss": 4.96595144, "grad_norm": 0.72013575, "learning_rate": 7.249e-05, "elapsed_time_per_iteration": 6.57474256, "memory(GiB)": 21.51, "elapsed_time": "4h 31m 33s", "remaining_time": "7h 5m 21s", "loss_scale": 1.0, "consumed_samples": 634624, "global_step/max_steps": "2479/6362"} +{"lm loss": 4.98847246, "grad_norm": 0.75649321, "learning_rate": 7.247e-05, "elapsed_time_per_iteration": 6.5313561, "memory(GiB)": 21.51, "elapsed_time": "4h 31m 40s", "remaining_time": "7h 5m 15s", "loss_scale": 1.0, "consumed_samples": 634880, "global_step/max_steps": "2480/6362"} +{"lm loss": 4.97901106, "grad_norm": 0.64556587, "learning_rate": 7.244e-05, "elapsed_time_per_iteration": 6.54204345, "memory(GiB)": 21.51, "elapsed_time": "4h 31m 46s", "remaining_time": "7h 5m 8s", "loss_scale": 1.0, "consumed_samples": 635136, "global_step/max_steps": "2481/6362"} +{"lm loss": 4.99078321, "grad_norm": 0.55624539, "learning_rate": 7.242e-05, "elapsed_time_per_iteration": 6.49592423, "memory(GiB)": 21.51, "elapsed_time": "4h 31m 53s", "remaining_time": "7h 5m 1s", "loss_scale": 1.0, "consumed_samples": 635392, "global_step/max_steps": "2482/6362"} +{"lm loss": 4.9974308, "grad_norm": 0.59009552, "learning_rate": 7.24e-05, "elapsed_time_per_iteration": 6.63844085, "memory(GiB)": 21.51, "elapsed_time": "4h 31m 59s", "remaining_time": "7h 4m 55s", "loss_scale": 1.0, "consumed_samples": 635648, "global_step/max_steps": "2483/6362"} +{"lm loss": 4.99148083, "grad_norm": 0.6794163, "learning_rate": 7.238e-05, "elapsed_time_per_iteration": 6.44980454, "memory(GiB)": 21.51, "elapsed_time": "4h 32m 6s", "remaining_time": "7h 4m 48s", "loss_scale": 1.0, "consumed_samples": 635904, "global_step/max_steps": "2484/6362"} +{"lm loss": 4.97760773, "grad_norm": 0.62710649, "learning_rate": 7.235e-05, "elapsed_time_per_iteration": 6.71743226, "memory(GiB)": 21.51, "elapsed_time": "4h 32m 13s", "remaining_time": "7h 4m 42s", "loss_scale": 1.0, "consumed_samples": 636160, "global_step/max_steps": "2485/6362"} +{"lm loss": 4.96933079, "grad_norm": 0.5375573, "learning_rate": 7.233e-05, "elapsed_time_per_iteration": 6.62370086, "memory(GiB)": 21.51, "elapsed_time": "4h 32m 19s", "remaining_time": "7h 4m 35s", "loss_scale": 1.0, "consumed_samples": 636416, "global_step/max_steps": "2486/6362"} +{"lm loss": 4.96898556, "grad_norm": 0.57499719, "learning_rate": 7.231e-05, "elapsed_time_per_iteration": 6.54847074, "memory(GiB)": 21.51, "elapsed_time": "4h 32m 26s", "remaining_time": "7h 4m 29s", "loss_scale": 1.0, "consumed_samples": 636672, "global_step/max_steps": "2487/6362"} +{"lm loss": 4.97085238, "grad_norm": 0.58899778, "learning_rate": 7.228e-05, "elapsed_time_per_iteration": 6.49185014, "memory(GiB)": 21.51, "elapsed_time": "4h 32m 32s", "remaining_time": "7h 4m 22s", "loss_scale": 1.0, "consumed_samples": 636928, "global_step/max_steps": "2488/6362"} +{"lm loss": 4.96027899, "grad_norm": 0.58062291, "learning_rate": 7.226e-05, "elapsed_time_per_iteration": 6.37499619, "memory(GiB)": 21.51, "elapsed_time": "4h 32m 39s", "remaining_time": "7h 4m 15s", "loss_scale": 1.0, "consumed_samples": 637184, "global_step/max_steps": "2489/6362"} +{"lm loss": 4.97884274, "grad_norm": 0.51455373, "learning_rate": 7.224e-05, "elapsed_time_per_iteration": 6.59000945, "memory(GiB)": 21.51, "elapsed_time": "4h 32m 45s", "remaining_time": "7h 4m 8s", "loss_scale": 1.0, "consumed_samples": 637440, "global_step/max_steps": "2490/6362"} +{"lm loss": 4.97260189, "grad_norm": 0.51411349, "learning_rate": 7.222e-05, "elapsed_time_per_iteration": 6.63350773, "memory(GiB)": 21.51, "elapsed_time": "4h 32m 52s", "remaining_time": "7h 4m 2s", "loss_scale": 1.0, "consumed_samples": 637696, "global_step/max_steps": "2491/6362"} +{"lm loss": 5.00898361, "grad_norm": 0.53027064, "learning_rate": 7.219e-05, "elapsed_time_per_iteration": 6.70715833, "memory(GiB)": 21.51, "elapsed_time": "4h 32m 59s", "remaining_time": "7h 3m 56s", "loss_scale": 1.0, "consumed_samples": 637952, "global_step/max_steps": "2492/6362"} +{"lm loss": 4.98186493, "grad_norm": 0.60074556, "learning_rate": 7.217e-05, "elapsed_time_per_iteration": 6.45905399, "memory(GiB)": 21.51, "elapsed_time": "4h 33m 5s", "remaining_time": "7h 3m 49s", "loss_scale": 1.0, "consumed_samples": 638208, "global_step/max_steps": "2493/6362"} +{"lm loss": 4.96509695, "grad_norm": 0.69935638, "learning_rate": 7.215e-05, "elapsed_time_per_iteration": 6.28055787, "memory(GiB)": 21.51, "elapsed_time": "4h 33m 11s", "remaining_time": "7h 3m 42s", "loss_scale": 1.0, "consumed_samples": 638464, "global_step/max_steps": "2494/6362"} +{"lm loss": 4.96671963, "grad_norm": 0.76260746, "learning_rate": 7.213e-05, "elapsed_time_per_iteration": 6.62346482, "memory(GiB)": 21.51, "elapsed_time": "4h 33m 18s", "remaining_time": "7h 3m 35s", "loss_scale": 1.0, "consumed_samples": 638720, "global_step/max_steps": "2495/6362"} +{"lm loss": 4.97730541, "grad_norm": 0.73778844, "learning_rate": 7.21e-05, "elapsed_time_per_iteration": 6.55974221, "memory(GiB)": 21.51, "elapsed_time": "4h 33m 24s", "remaining_time": "7h 3m 29s", "loss_scale": 1.0, "consumed_samples": 638976, "global_step/max_steps": "2496/6362"} +{"lm loss": 5.00334406, "grad_norm": 0.62707084, "learning_rate": 7.208e-05, "elapsed_time_per_iteration": 6.50035119, "memory(GiB)": 21.51, "elapsed_time": "4h 33m 31s", "remaining_time": "7h 3m 22s", "loss_scale": 1.0, "consumed_samples": 639232, "global_step/max_steps": "2497/6362"} +{"lm loss": 4.99841785, "grad_norm": 0.60276359, "learning_rate": 7.206e-05, "elapsed_time_per_iteration": 6.51253796, "memory(GiB)": 21.51, "elapsed_time": "4h 33m 37s", "remaining_time": "7h 3m 15s", "loss_scale": 1.0, "consumed_samples": 639488, "global_step/max_steps": "2498/6362"} +{"lm loss": 4.98929596, "grad_norm": 0.6893903, "learning_rate": 7.203e-05, "elapsed_time_per_iteration": 6.42200041, "memory(GiB)": 21.51, "elapsed_time": "4h 33m 44s", "remaining_time": "7h 3m 9s", "loss_scale": 1.0, "consumed_samples": 639744, "global_step/max_steps": "2499/6362"} +{"lm loss": 4.98198938, "grad_norm": 0.76683027, "learning_rate": 7.201e-05, "elapsed_time_per_iteration": 6.61655354, "memory(GiB)": 21.51, "elapsed_time": "4h 33m 51s", "remaining_time": "7h 3m 2s", "loss_scale": 1.0, "consumed_samples": 640000, "global_step/max_steps": "2500/6362"} +{"lm loss": 4.97338676, "grad_norm": 0.72913915, "learning_rate": 7.199e-05, "elapsed_time_per_iteration": 6.57136941, "memory(GiB)": 21.51, "elapsed_time": "4h 33m 57s", "remaining_time": "7h 2m 56s", "loss_scale": 1.0, "consumed_samples": 640256, "global_step/max_steps": "2501/6362"} +{"lm loss": 4.97049522, "grad_norm": 0.67724776, "learning_rate": 7.197e-05, "elapsed_time_per_iteration": 6.62837148, "memory(GiB)": 21.51, "elapsed_time": "4h 34m 4s", "remaining_time": "7h 2m 49s", "loss_scale": 1.0, "consumed_samples": 640512, "global_step/max_steps": "2502/6362"} +{"lm loss": 4.97636652, "grad_norm": 0.66372645, "learning_rate": 7.194e-05, "elapsed_time_per_iteration": 6.44782662, "memory(GiB)": 21.51, "elapsed_time": "4h 34m 10s", "remaining_time": "7h 2m 42s", "loss_scale": 1.0, "consumed_samples": 640768, "global_step/max_steps": "2503/6362"} +{"lm loss": 4.9797101, "grad_norm": 0.62542164, "learning_rate": 7.192e-05, "elapsed_time_per_iteration": 6.53498149, "memory(GiB)": 21.51, "elapsed_time": "4h 34m 17s", "remaining_time": "7h 2m 36s", "loss_scale": 1.0, "consumed_samples": 641024, "global_step/max_steps": "2504/6362"} +{"lm loss": 4.98035669, "grad_norm": 0.63794947, "learning_rate": 7.19e-05, "elapsed_time_per_iteration": 6.38328242, "memory(GiB)": 21.51, "elapsed_time": "4h 34m 23s", "remaining_time": "7h 2m 29s", "loss_scale": 1.0, "consumed_samples": 641280, "global_step/max_steps": "2505/6362"} +{"lm loss": 4.98707247, "grad_norm": 0.64315778, "learning_rate": 7.187e-05, "elapsed_time_per_iteration": 6.46488547, "memory(GiB)": 21.51, "elapsed_time": "4h 34m 30s", "remaining_time": "7h 2m 22s", "loss_scale": 1.0, "consumed_samples": 641536, "global_step/max_steps": "2506/6362"} +{"lm loss": 5.0077424, "grad_norm": 0.68146068, "learning_rate": 7.185e-05, "elapsed_time_per_iteration": 6.4350307, "memory(GiB)": 21.51, "elapsed_time": "4h 34m 36s", "remaining_time": "7h 2m 15s", "loss_scale": 1.0, "consumed_samples": 641792, "global_step/max_steps": "2507/6362"} +{"lm loss": 4.9809804, "grad_norm": 0.54787844, "learning_rate": 7.183e-05, "elapsed_time_per_iteration": 6.59752345, "memory(GiB)": 21.51, "elapsed_time": "4h 34m 43s", "remaining_time": "7h 2m 9s", "loss_scale": 1.0, "consumed_samples": 642048, "global_step/max_steps": "2508/6362"} +{"lm loss": 4.98727417, "grad_norm": 0.48615369, "learning_rate": 7.181e-05, "elapsed_time_per_iteration": 6.68722153, "memory(GiB)": 21.51, "elapsed_time": "4h 34m 49s", "remaining_time": "7h 2m 2s", "loss_scale": 1.0, "consumed_samples": 642304, "global_step/max_steps": "2509/6362"} +{"lm loss": 4.96695518, "grad_norm": 0.52380502, "learning_rate": 7.178e-05, "elapsed_time_per_iteration": 6.45033026, "memory(GiB)": 21.51, "elapsed_time": "4h 34m 56s", "remaining_time": "7h 1m 56s", "loss_scale": 1.0, "consumed_samples": 642560, "global_step/max_steps": "2510/6362"} +{"lm loss": 4.98702717, "grad_norm": 0.60166049, "learning_rate": 7.176e-05, "elapsed_time_per_iteration": 6.51112103, "memory(GiB)": 21.51, "elapsed_time": "4h 35m 2s", "remaining_time": "7h 1m 49s", "loss_scale": 1.0, "consumed_samples": 642816, "global_step/max_steps": "2511/6362"} +{"lm loss": 5.00231647, "grad_norm": 0.55391347, "learning_rate": 7.174e-05, "elapsed_time_per_iteration": 6.50938702, "memory(GiB)": 21.51, "elapsed_time": "4h 35m 9s", "remaining_time": "7h 1m 42s", "loss_scale": 1.0, "consumed_samples": 643072, "global_step/max_steps": "2512/6362"} +{"lm loss": 4.96598577, "grad_norm": 0.56792432, "learning_rate": 7.171e-05, "elapsed_time_per_iteration": 6.45580435, "memory(GiB)": 21.51, "elapsed_time": "4h 35m 15s", "remaining_time": "7h 1m 36s", "loss_scale": 1.0, "consumed_samples": 643328, "global_step/max_steps": "2513/6362"} +{"lm loss": 4.98931408, "grad_norm": 0.56944805, "learning_rate": 7.169e-05, "elapsed_time_per_iteration": 6.59365225, "memory(GiB)": 21.51, "elapsed_time": "4h 35m 22s", "remaining_time": "7h 1m 29s", "loss_scale": 1.0, "consumed_samples": 643584, "global_step/max_steps": "2514/6362"} +{"lm loss": 4.96385574, "grad_norm": 0.64610237, "learning_rate": 7.167e-05, "elapsed_time_per_iteration": 6.47165489, "memory(GiB)": 21.51, "elapsed_time": "4h 35m 28s", "remaining_time": "7h 1m 22s", "loss_scale": 1.0, "consumed_samples": 643840, "global_step/max_steps": "2515/6362"} +{"lm loss": 4.96107006, "grad_norm": 0.71296376, "learning_rate": 7.164e-05, "elapsed_time_per_iteration": 6.61706495, "memory(GiB)": 21.51, "elapsed_time": "4h 35m 35s", "remaining_time": "7h 1m 16s", "loss_scale": 1.0, "consumed_samples": 644096, "global_step/max_steps": "2516/6362"} +{"lm loss": 4.9663806, "grad_norm": 0.80027092, "learning_rate": 7.162e-05, "elapsed_time_per_iteration": 6.54437947, "memory(GiB)": 21.51, "elapsed_time": "4h 35m 41s", "remaining_time": "7h 1m 9s", "loss_scale": 1.0, "consumed_samples": 644352, "global_step/max_steps": "2517/6362"} +{"lm loss": 4.958776, "grad_norm": 0.70581138, "learning_rate": 7.16e-05, "elapsed_time_per_iteration": 6.49342942, "memory(GiB)": 21.51, "elapsed_time": "4h 35m 48s", "remaining_time": "7h 1m 2s", "loss_scale": 1.0, "consumed_samples": 644608, "global_step/max_steps": "2518/6362"} +{"lm loss": 4.9728837, "grad_norm": 0.6189118, "learning_rate": 7.158e-05, "elapsed_time_per_iteration": 6.57795525, "memory(GiB)": 21.51, "elapsed_time": "4h 35m 54s", "remaining_time": "7h 0m 56s", "loss_scale": 1.0, "consumed_samples": 644864, "global_step/max_steps": "2519/6362"} +{"lm loss": 4.97104549, "grad_norm": 0.60445374, "learning_rate": 7.155e-05, "elapsed_time_per_iteration": 6.52346039, "memory(GiB)": 21.51, "elapsed_time": "4h 36m 1s", "remaining_time": "7h 0m 49s", "loss_scale": 1.0, "consumed_samples": 645120, "global_step/max_steps": "2520/6362"} +{"lm loss": 4.97703791, "grad_norm": 0.75733316, "learning_rate": 7.153e-05, "elapsed_time_per_iteration": 6.73668623, "memory(GiB)": 21.51, "elapsed_time": "4h 36m 8s", "remaining_time": "7h 0m 43s", "loss_scale": 1.0, "consumed_samples": 645376, "global_step/max_steps": "2521/6362"} +{"lm loss": 4.98576546, "grad_norm": 0.86850774, "learning_rate": 7.151e-05, "elapsed_time_per_iteration": 6.61064053, "memory(GiB)": 21.51, "elapsed_time": "4h 36m 14s", "remaining_time": "7h 0m 36s", "loss_scale": 1.0, "consumed_samples": 645632, "global_step/max_steps": "2522/6362"} +{"lm loss": 4.98529863, "grad_norm": 0.61415362, "learning_rate": 7.148e-05, "elapsed_time_per_iteration": 6.51866817, "memory(GiB)": 21.51, "elapsed_time": "4h 36m 21s", "remaining_time": "7h 0m 30s", "loss_scale": 1.0, "consumed_samples": 645888, "global_step/max_steps": "2523/6362"} +{"lm loss": 4.97675371, "grad_norm": 0.50766408, "learning_rate": 7.146e-05, "elapsed_time_per_iteration": 6.60173035, "memory(GiB)": 21.51, "elapsed_time": "4h 36m 27s", "remaining_time": "7h 0m 23s", "loss_scale": 1.0, "consumed_samples": 646144, "global_step/max_steps": "2524/6362"} +{"lm loss": 4.96559143, "grad_norm": 0.67199898, "learning_rate": 7.144e-05, "elapsed_time_per_iteration": 6.78673911, "memory(GiB)": 21.51, "elapsed_time": "4h 36m 34s", "remaining_time": "7h 0m 17s", "loss_scale": 1.0, "consumed_samples": 646400, "global_step/max_steps": "2525/6362"} +{"lm loss": 4.98432875, "grad_norm": 0.75035387, "learning_rate": 7.142e-05, "elapsed_time_per_iteration": 6.72786164, "memory(GiB)": 21.51, "elapsed_time": "4h 36m 41s", "remaining_time": "7h 0m 11s", "loss_scale": 1.0, "consumed_samples": 646656, "global_step/max_steps": "2526/6362"} +{"lm loss": 5.00039005, "grad_norm": 0.68510395, "learning_rate": 7.139e-05, "elapsed_time_per_iteration": 6.60285378, "memory(GiB)": 21.51, "elapsed_time": "4h 36m 48s", "remaining_time": "7h 0m 4s", "loss_scale": 1.0, "consumed_samples": 646912, "global_step/max_steps": "2527/6362"} +{"lm loss": 4.98744249, "grad_norm": 0.55168217, "learning_rate": 7.137e-05, "elapsed_time_per_iteration": 6.58705044, "memory(GiB)": 21.51, "elapsed_time": "4h 36m 54s", "remaining_time": "6h 59m 58s", "loss_scale": 1.0, "consumed_samples": 647168, "global_step/max_steps": "2528/6362"} +{"lm loss": 4.9702158, "grad_norm": 0.67995495, "learning_rate": 7.135e-05, "elapsed_time_per_iteration": 6.83868003, "memory(GiB)": 21.51, "elapsed_time": "4h 37m 1s", "remaining_time": "6h 59m 51s", "loss_scale": 1.0, "consumed_samples": 647424, "global_step/max_steps": "2529/6362"} +{"lm loss": 4.95942259, "grad_norm": 0.62895572, "learning_rate": 7.132e-05, "elapsed_time_per_iteration": 6.5810833, "memory(GiB)": 21.51, "elapsed_time": "4h 37m 8s", "remaining_time": "6h 59m 45s", "loss_scale": 1.0, "consumed_samples": 647680, "global_step/max_steps": "2530/6362"} +{"lm loss": 4.99281549, "grad_norm": 0.58473533, "learning_rate": 7.13e-05, "elapsed_time_per_iteration": 6.62422371, "memory(GiB)": 21.51, "elapsed_time": "4h 37m 14s", "remaining_time": "6h 59m 38s", "loss_scale": 1.0, "consumed_samples": 647936, "global_step/max_steps": "2531/6362"} +{"lm loss": 4.98853016, "grad_norm": 0.69266772, "learning_rate": 7.128e-05, "elapsed_time_per_iteration": 6.59882665, "memory(GiB)": 21.51, "elapsed_time": "4h 37m 21s", "remaining_time": "6h 59m 32s", "loss_scale": 1.0, "consumed_samples": 648192, "global_step/max_steps": "2532/6362"} +{"lm loss": 4.99141884, "grad_norm": 0.66460657, "learning_rate": 7.125e-05, "elapsed_time_per_iteration": 6.68325114, "memory(GiB)": 21.51, "elapsed_time": "4h 37m 28s", "remaining_time": "6h 59m 25s", "loss_scale": 1.0, "consumed_samples": 648448, "global_step/max_steps": "2533/6362"} +{"lm loss": 4.97792959, "grad_norm": 0.61870897, "learning_rate": 7.123e-05, "elapsed_time_per_iteration": 6.65935946, "memory(GiB)": 21.51, "elapsed_time": "4h 37m 34s", "remaining_time": "6h 59m 19s", "loss_scale": 1.0, "consumed_samples": 648704, "global_step/max_steps": "2534/6362"} +{"lm loss": 5.00350904, "grad_norm": 0.70387334, "learning_rate": 7.121e-05, "elapsed_time_per_iteration": 6.54686832, "memory(GiB)": 21.51, "elapsed_time": "4h 37m 41s", "remaining_time": "6h 59m 12s", "loss_scale": 1.0, "consumed_samples": 648960, "global_step/max_steps": "2535/6362"} +{"lm loss": 4.94957972, "grad_norm": 0.6863516, "learning_rate": 7.119e-05, "elapsed_time_per_iteration": 6.63898897, "memory(GiB)": 21.51, "elapsed_time": "4h 37m 47s", "remaining_time": "6h 59m 6s", "loss_scale": 1.0, "consumed_samples": 649216, "global_step/max_steps": "2536/6362"} +{"lm loss": 4.9829545, "grad_norm": 0.65560323, "learning_rate": 7.116e-05, "elapsed_time_per_iteration": 6.53455949, "memory(GiB)": 21.51, "elapsed_time": "4h 37m 54s", "remaining_time": "6h 58m 59s", "loss_scale": 1.0, "consumed_samples": 649472, "global_step/max_steps": "2537/6362"} +{"lm loss": 4.97553205, "grad_norm": 0.59753734, "learning_rate": 7.114e-05, "elapsed_time_per_iteration": 6.56154919, "memory(GiB)": 21.51, "elapsed_time": "4h 38m 0s", "remaining_time": "6h 58m 53s", "loss_scale": 1.0, "consumed_samples": 649728, "global_step/max_steps": "2538/6362"} +{"lm loss": 4.97209454, "grad_norm": 0.56061941, "learning_rate": 7.112e-05, "elapsed_time_per_iteration": 6.74990225, "memory(GiB)": 21.51, "elapsed_time": "4h 38m 7s", "remaining_time": "6h 58m 46s", "loss_scale": 1.0, "consumed_samples": 649984, "global_step/max_steps": "2539/6362"} +{"lm loss": 4.98911953, "grad_norm": 0.57638812, "learning_rate": 7.109e-05, "elapsed_time_per_iteration": 6.67353392, "memory(GiB)": 21.51, "elapsed_time": "4h 38m 14s", "remaining_time": "6h 58m 40s", "loss_scale": 1.0, "consumed_samples": 650240, "global_step/max_steps": "2540/6362"} +{"lm loss": 4.94935799, "grad_norm": 0.54325199, "learning_rate": 7.107e-05, "elapsed_time_per_iteration": 6.60296226, "memory(GiB)": 21.51, "elapsed_time": "4h 38m 20s", "remaining_time": "6h 58m 33s", "loss_scale": 1.0, "consumed_samples": 650496, "global_step/max_steps": "2541/6362"} +{"lm loss": 4.9731822, "grad_norm": 0.56101066, "learning_rate": 7.105e-05, "elapsed_time_per_iteration": 6.59018874, "memory(GiB)": 21.51, "elapsed_time": "4h 38m 27s", "remaining_time": "6h 58m 27s", "loss_scale": 1.0, "consumed_samples": 650752, "global_step/max_steps": "2542/6362"} +{"lm loss": 4.95761585, "grad_norm": 0.70130396, "learning_rate": 7.102e-05, "elapsed_time_per_iteration": 6.66183066, "memory(GiB)": 21.51, "elapsed_time": "4h 38m 34s", "remaining_time": "6h 58m 20s", "loss_scale": 1.0, "consumed_samples": 651008, "global_step/max_steps": "2543/6362"} +{"lm loss": 4.96809483, "grad_norm": 0.64423132, "learning_rate": 7.1e-05, "elapsed_time_per_iteration": 6.59798336, "memory(GiB)": 21.51, "elapsed_time": "4h 38m 40s", "remaining_time": "6h 58m 14s", "loss_scale": 1.0, "consumed_samples": 651264, "global_step/max_steps": "2544/6362"} +{"lm loss": 4.99063301, "grad_norm": 0.5250591, "learning_rate": 7.098e-05, "elapsed_time_per_iteration": 6.53448057, "memory(GiB)": 21.51, "elapsed_time": "4h 38m 47s", "remaining_time": "6h 58m 7s", "loss_scale": 1.0, "consumed_samples": 651520, "global_step/max_steps": "2545/6362"} +{"lm loss": 4.97003698, "grad_norm": 0.55697376, "learning_rate": 7.095e-05, "elapsed_time_per_iteration": 6.63661432, "memory(GiB)": 21.51, "elapsed_time": "4h 38m 53s", "remaining_time": "6h 58m 1s", "loss_scale": 1.0, "consumed_samples": 651776, "global_step/max_steps": "2546/6362"} +{"lm loss": 4.97403908, "grad_norm": 0.60209662, "learning_rate": 7.093e-05, "elapsed_time_per_iteration": 6.51848745, "memory(GiB)": 21.51, "elapsed_time": "4h 39m 0s", "remaining_time": "6h 57m 54s", "loss_scale": 1.0, "consumed_samples": 652032, "global_step/max_steps": "2547/6362"} +{"lm loss": 4.94713593, "grad_norm": 0.64614576, "learning_rate": 7.091e-05, "elapsed_time_per_iteration": 6.5039475, "memory(GiB)": 21.51, "elapsed_time": "4h 39m 7s", "remaining_time": "6h 57m 47s", "loss_scale": 1.0, "consumed_samples": 652288, "global_step/max_steps": "2548/6362"} +{"lm loss": 4.99078035, "grad_norm": 0.64855552, "learning_rate": 7.089e-05, "elapsed_time_per_iteration": 6.76430416, "memory(GiB)": 21.51, "elapsed_time": "4h 39m 13s", "remaining_time": "6h 57m 41s", "loss_scale": 1.0, "consumed_samples": 652544, "global_step/max_steps": "2549/6362"} +{"lm loss": 4.95908022, "grad_norm": 0.56013668, "learning_rate": 7.086e-05, "elapsed_time_per_iteration": 6.69716573, "memory(GiB)": 21.51, "elapsed_time": "4h 39m 20s", "remaining_time": "6h 57m 35s", "loss_scale": 1.0, "consumed_samples": 652800, "global_step/max_steps": "2550/6362"} +{"lm loss": 4.99721575, "grad_norm": 0.60166281, "learning_rate": 7.084e-05, "elapsed_time_per_iteration": 6.44323969, "memory(GiB)": 21.51, "elapsed_time": "4h 39m 26s", "remaining_time": "6h 57m 28s", "loss_scale": 1.0, "consumed_samples": 653056, "global_step/max_steps": "2551/6362"} +{"lm loss": 4.97883129, "grad_norm": 0.62619263, "learning_rate": 7.082e-05, "elapsed_time_per_iteration": 6.63219595, "memory(GiB)": 21.51, "elapsed_time": "4h 39m 33s", "remaining_time": "6h 57m 22s", "loss_scale": 1.0, "consumed_samples": 653312, "global_step/max_steps": "2552/6362"} +{"lm loss": 4.97284889, "grad_norm": 0.72279561, "learning_rate": 7.079e-05, "elapsed_time_per_iteration": 6.58359814, "memory(GiB)": 21.51, "elapsed_time": "4h 39m 40s", "remaining_time": "6h 57m 15s", "loss_scale": 1.0, "consumed_samples": 653568, "global_step/max_steps": "2553/6362"} +{"lm loss": 4.98627138, "grad_norm": 0.70008671, "learning_rate": 7.077e-05, "elapsed_time_per_iteration": 6.62462783, "memory(GiB)": 21.51, "elapsed_time": "4h 39m 46s", "remaining_time": "6h 57m 8s", "loss_scale": 1.0, "consumed_samples": 653824, "global_step/max_steps": "2554/6362"} +{"lm loss": 4.9663825, "grad_norm": 0.75550413, "learning_rate": 7.075e-05, "elapsed_time_per_iteration": 6.44579291, "memory(GiB)": 21.51, "elapsed_time": "4h 39m 53s", "remaining_time": "6h 57m 2s", "loss_scale": 1.0, "consumed_samples": 654080, "global_step/max_steps": "2555/6362"} +{"lm loss": 4.98879576, "grad_norm": 0.64877844, "learning_rate": 7.072e-05, "elapsed_time_per_iteration": 6.42850304, "memory(GiB)": 21.51, "elapsed_time": "4h 39m 59s", "remaining_time": "6h 56m 55s", "loss_scale": 1.0, "consumed_samples": 654336, "global_step/max_steps": "2556/6362"} +{"lm loss": 4.98790407, "grad_norm": 0.48571694, "learning_rate": 7.07e-05, "elapsed_time_per_iteration": 6.53639412, "memory(GiB)": 21.51, "elapsed_time": "4h 40m 6s", "remaining_time": "6h 56m 48s", "loss_scale": 1.0, "consumed_samples": 654592, "global_step/max_steps": "2557/6362"} +{"lm loss": 4.97559261, "grad_norm": 0.61413914, "learning_rate": 7.068e-05, "elapsed_time_per_iteration": 6.5888176, "memory(GiB)": 21.51, "elapsed_time": "4h 40m 12s", "remaining_time": "6h 56m 42s", "loss_scale": 1.0, "consumed_samples": 654848, "global_step/max_steps": "2558/6362"} +{"lm loss": 4.95195293, "grad_norm": 0.58732724, "learning_rate": 7.065e-05, "elapsed_time_per_iteration": 6.5650959, "memory(GiB)": 21.51, "elapsed_time": "4h 40m 19s", "remaining_time": "6h 56m 35s", "loss_scale": 1.0, "consumed_samples": 655104, "global_step/max_steps": "2559/6362"} +{"lm loss": 4.97351885, "grad_norm": 0.64234281, "learning_rate": 7.063e-05, "elapsed_time_per_iteration": 6.45306134, "memory(GiB)": 21.51, "elapsed_time": "4h 40m 25s", "remaining_time": "6h 56m 28s", "loss_scale": 1.0, "consumed_samples": 655360, "global_step/max_steps": "2560/6362"} +{"lm loss": 4.98717356, "grad_norm": 0.60873413, "learning_rate": 7.061e-05, "elapsed_time_per_iteration": 6.44877243, "memory(GiB)": 21.51, "elapsed_time": "4h 40m 32s", "remaining_time": "6h 56m 22s", "loss_scale": 1.0, "consumed_samples": 655616, "global_step/max_steps": "2561/6362"} +{"lm loss": 4.96674204, "grad_norm": 0.54041559, "learning_rate": 7.058e-05, "elapsed_time_per_iteration": 6.77659202, "memory(GiB)": 21.51, "elapsed_time": "4h 40m 39s", "remaining_time": "6h 56m 15s", "loss_scale": 1.0, "consumed_samples": 655872, "global_step/max_steps": "2562/6362"} +{"lm loss": 4.94646311, "grad_norm": 0.56407946, "learning_rate": 7.056e-05, "elapsed_time_per_iteration": 6.62557912, "memory(GiB)": 21.51, "elapsed_time": "4h 40m 45s", "remaining_time": "6h 56m 9s", "loss_scale": 1.0, "consumed_samples": 656128, "global_step/max_steps": "2563/6362"} +{"lm loss": 4.97372437, "grad_norm": 0.49966279, "learning_rate": 7.054e-05, "elapsed_time_per_iteration": 6.57650304, "memory(GiB)": 21.51, "elapsed_time": "4h 40m 52s", "remaining_time": "6h 56m 2s", "loss_scale": 1.0, "consumed_samples": 656384, "global_step/max_steps": "2564/6362"} +{"lm loss": 4.9631238, "grad_norm": 0.60411406, "learning_rate": 7.051e-05, "elapsed_time_per_iteration": 6.40360093, "memory(GiB)": 21.51, "elapsed_time": "4h 40m 58s", "remaining_time": "6h 55m 55s", "loss_scale": 1.0, "consumed_samples": 656640, "global_step/max_steps": "2565/6362"} +{"lm loss": 4.97741079, "grad_norm": 0.63505977, "learning_rate": 7.049e-05, "elapsed_time_per_iteration": 6.87382841, "memory(GiB)": 21.51, "elapsed_time": "4h 41m 5s", "remaining_time": "6h 55m 49s", "loss_scale": 1.0, "consumed_samples": 656896, "global_step/max_steps": "2566/6362"} +{"lm loss": 4.98215675, "grad_norm": 0.62302375, "learning_rate": 7.047e-05, "elapsed_time_per_iteration": 6.70867276, "memory(GiB)": 21.51, "elapsed_time": "4h 41m 12s", "remaining_time": "6h 55m 43s", "loss_scale": 1.0, "consumed_samples": 657152, "global_step/max_steps": "2567/6362"} +{"lm loss": 4.95947456, "grad_norm": 0.70714927, "learning_rate": 7.045e-05, "elapsed_time_per_iteration": 6.65091228, "memory(GiB)": 21.51, "elapsed_time": "4h 41m 18s", "remaining_time": "6h 55m 37s", "loss_scale": 1.0, "consumed_samples": 657408, "global_step/max_steps": "2568/6362"} +{"lm loss": 4.96793699, "grad_norm": 0.6431759, "learning_rate": 7.042e-05, "elapsed_time_per_iteration": 6.44480944, "memory(GiB)": 21.51, "elapsed_time": "4h 41m 25s", "remaining_time": "6h 55m 30s", "loss_scale": 1.0, "consumed_samples": 657664, "global_step/max_steps": "2569/6362"} +{"lm loss": 4.95854998, "grad_norm": 0.6415813, "learning_rate": 7.04e-05, "elapsed_time_per_iteration": 6.4944253, "memory(GiB)": 21.51, "elapsed_time": "4h 41m 31s", "remaining_time": "6h 55m 23s", "loss_scale": 1.0, "consumed_samples": 657920, "global_step/max_steps": "2570/6362"} +{"lm loss": 4.97810793, "grad_norm": 0.69430274, "learning_rate": 7.038e-05, "elapsed_time_per_iteration": 6.45063305, "memory(GiB)": 21.51, "elapsed_time": "4h 41m 38s", "remaining_time": "6h 55m 16s", "loss_scale": 1.0, "consumed_samples": 658176, "global_step/max_steps": "2571/6362"} +{"lm loss": 4.98442745, "grad_norm": 0.63142383, "learning_rate": 7.035e-05, "elapsed_time_per_iteration": 6.58141851, "memory(GiB)": 21.51, "elapsed_time": "4h 41m 44s", "remaining_time": "6h 55m 10s", "loss_scale": 1.0, "consumed_samples": 658432, "global_step/max_steps": "2572/6362"} +{"lm loss": 4.99064159, "grad_norm": 0.57118899, "learning_rate": 7.033e-05, "elapsed_time_per_iteration": 6.55584574, "memory(GiB)": 21.51, "elapsed_time": "4h 41m 51s", "remaining_time": "6h 55m 3s", "loss_scale": 1.0, "consumed_samples": 658688, "global_step/max_steps": "2573/6362"} +{"lm loss": 4.91726065, "grad_norm": 0.56469095, "learning_rate": 7.031e-05, "elapsed_time_per_iteration": 6.69741631, "memory(GiB)": 21.51, "elapsed_time": "4h 41m 58s", "remaining_time": "6h 54m 57s", "loss_scale": 1.0, "consumed_samples": 658944, "global_step/max_steps": "2574/6362"} +{"lm loss": 4.97966576, "grad_norm": 0.54323417, "learning_rate": 7.028e-05, "elapsed_time_per_iteration": 6.7243669, "memory(GiB)": 21.51, "elapsed_time": "4h 42m 4s", "remaining_time": "6h 54m 50s", "loss_scale": 1.0, "consumed_samples": 659200, "global_step/max_steps": "2575/6362"} +{"lm loss": 4.95239735, "grad_norm": 0.66136324, "learning_rate": 7.026e-05, "elapsed_time_per_iteration": 6.62131238, "memory(GiB)": 21.51, "elapsed_time": "4h 42m 11s", "remaining_time": "6h 54m 44s", "loss_scale": 1.0, "consumed_samples": 659456, "global_step/max_steps": "2576/6362"} +{"lm loss": 4.98929119, "grad_norm": 0.63965505, "learning_rate": 7.024e-05, "elapsed_time_per_iteration": 6.47106314, "memory(GiB)": 21.51, "elapsed_time": "4h 42m 17s", "remaining_time": "6h 54m 37s", "loss_scale": 1.0, "consumed_samples": 659712, "global_step/max_steps": "2577/6362"} +{"lm loss": 4.95967865, "grad_norm": 0.53751343, "learning_rate": 7.021e-05, "elapsed_time_per_iteration": 6.48134112, "memory(GiB)": 21.51, "elapsed_time": "4h 42m 24s", "remaining_time": "6h 54m 31s", "loss_scale": 1.0, "consumed_samples": 659968, "global_step/max_steps": "2578/6362"} +{"lm loss": 4.95576859, "grad_norm": 0.5894953, "learning_rate": 7.019e-05, "elapsed_time_per_iteration": 6.69981217, "memory(GiB)": 21.51, "elapsed_time": "4h 42m 31s", "remaining_time": "6h 54m 24s", "loss_scale": 1.0, "consumed_samples": 660224, "global_step/max_steps": "2579/6362"} +{"lm loss": 4.94989777, "grad_norm": 0.61734098, "learning_rate": 7.017e-05, "elapsed_time_per_iteration": 6.56646085, "memory(GiB)": 21.51, "elapsed_time": "4h 42m 37s", "remaining_time": "6h 54m 18s", "loss_scale": 1.0, "consumed_samples": 660480, "global_step/max_steps": "2580/6362"} +{"lm loss": 4.97153187, "grad_norm": 0.55762243, "learning_rate": 7.014e-05, "elapsed_time_per_iteration": 6.64857459, "memory(GiB)": 21.51, "elapsed_time": "4h 42m 44s", "remaining_time": "6h 54m 11s", "loss_scale": 1.0, "consumed_samples": 660736, "global_step/max_steps": "2581/6362"} +{"lm loss": 4.96415424, "grad_norm": 0.57512456, "learning_rate": 7.012e-05, "elapsed_time_per_iteration": 6.67431521, "memory(GiB)": 21.51, "elapsed_time": "4h 42m 50s", "remaining_time": "6h 54m 5s", "loss_scale": 1.0, "consumed_samples": 660992, "global_step/max_steps": "2582/6362"} +{"lm loss": 4.97665024, "grad_norm": 0.51267397, "learning_rate": 7.01e-05, "elapsed_time_per_iteration": 6.58407259, "memory(GiB)": 21.51, "elapsed_time": "4h 42m 57s", "remaining_time": "6h 53m 58s", "loss_scale": 1.0, "consumed_samples": 661248, "global_step/max_steps": "2583/6362"} +{"lm loss": 4.98072147, "grad_norm": 0.6395393, "learning_rate": 7.007e-05, "elapsed_time_per_iteration": 6.5899837, "memory(GiB)": 21.51, "elapsed_time": "4h 43m 4s", "remaining_time": "6h 53m 52s", "loss_scale": 1.0, "consumed_samples": 661504, "global_step/max_steps": "2584/6362"} +{"lm loss": 4.95422363, "grad_norm": 0.79498392, "learning_rate": 7.005e-05, "elapsed_time_per_iteration": 6.46846771, "memory(GiB)": 21.51, "elapsed_time": "4h 43m 10s", "remaining_time": "6h 53m 45s", "loss_scale": 1.0, "consumed_samples": 661760, "global_step/max_steps": "2585/6362"} +{"lm loss": 4.99116516, "grad_norm": 0.6409632, "learning_rate": 7.003e-05, "elapsed_time_per_iteration": 6.42749023, "memory(GiB)": 21.51, "elapsed_time": "4h 43m 17s", "remaining_time": "6h 53m 38s", "loss_scale": 1.0, "consumed_samples": 662016, "global_step/max_steps": "2586/6362"} +{"lm loss": 4.97063875, "grad_norm": 0.6211198, "learning_rate": 7e-05, "elapsed_time_per_iteration": 6.60196137, "memory(GiB)": 21.51, "elapsed_time": "4h 43m 23s", "remaining_time": "6h 53m 32s", "loss_scale": 1.0, "consumed_samples": 662272, "global_step/max_steps": "2587/6362"} +{"lm loss": 4.96057224, "grad_norm": 0.57225716, "learning_rate": 6.998e-05, "elapsed_time_per_iteration": 6.60609984, "memory(GiB)": 21.51, "elapsed_time": "4h 43m 30s", "remaining_time": "6h 53m 25s", "loss_scale": 1.0, "consumed_samples": 662528, "global_step/max_steps": "2588/6362"} +{"lm loss": 4.98174524, "grad_norm": 0.52012527, "learning_rate": 6.996e-05, "elapsed_time_per_iteration": 6.49570251, "memory(GiB)": 21.51, "elapsed_time": "4h 43m 36s", "remaining_time": "6h 53m 18s", "loss_scale": 1.0, "consumed_samples": 662784, "global_step/max_steps": "2589/6362"} +{"lm loss": 4.99186754, "grad_norm": 0.6945594, "learning_rate": 6.993e-05, "elapsed_time_per_iteration": 6.46465945, "memory(GiB)": 21.51, "elapsed_time": "4h 43m 43s", "remaining_time": "6h 53m 12s", "loss_scale": 1.0, "consumed_samples": 663040, "global_step/max_steps": "2590/6362"} +{"lm loss": 4.96323872, "grad_norm": 0.69279742, "learning_rate": 6.991e-05, "elapsed_time_per_iteration": 6.58222914, "memory(GiB)": 21.51, "elapsed_time": "4h 43m 49s", "remaining_time": "6h 53m 5s", "loss_scale": 1.0, "consumed_samples": 663296, "global_step/max_steps": "2591/6362"} +{"lm loss": 4.97099495, "grad_norm": 0.62529826, "learning_rate": 6.989e-05, "elapsed_time_per_iteration": 6.6116116, "memory(GiB)": 21.51, "elapsed_time": "4h 43m 56s", "remaining_time": "6h 52m 59s", "loss_scale": 1.0, "consumed_samples": 663552, "global_step/max_steps": "2592/6362"} +{"lm loss": 4.97522926, "grad_norm": 0.56303471, "learning_rate": 6.986e-05, "elapsed_time_per_iteration": 6.66845608, "memory(GiB)": 21.51, "elapsed_time": "4h 44m 3s", "remaining_time": "6h 52m 52s", "loss_scale": 1.0, "consumed_samples": 663808, "global_step/max_steps": "2593/6362"} +{"lm loss": 4.96433735, "grad_norm": 0.6292066, "learning_rate": 6.984e-05, "elapsed_time_per_iteration": 6.46968174, "memory(GiB)": 21.51, "elapsed_time": "4h 44m 9s", "remaining_time": "6h 52m 45s", "loss_scale": 1.0, "consumed_samples": 664064, "global_step/max_steps": "2594/6362"} +{"lm loss": 4.9449544, "grad_norm": 0.70095015, "learning_rate": 6.982e-05, "elapsed_time_per_iteration": 6.58828807, "memory(GiB)": 21.51, "elapsed_time": "4h 44m 16s", "remaining_time": "6h 52m 39s", "loss_scale": 1.0, "consumed_samples": 664320, "global_step/max_steps": "2595/6362"} +{"lm loss": 4.97716951, "grad_norm": 0.54112595, "learning_rate": 6.979e-05, "elapsed_time_per_iteration": 6.50103283, "memory(GiB)": 21.51, "elapsed_time": "4h 44m 22s", "remaining_time": "6h 52m 32s", "loss_scale": 1.0, "consumed_samples": 664576, "global_step/max_steps": "2596/6362"} +{"lm loss": 4.9747839, "grad_norm": 0.57965153, "learning_rate": 6.977e-05, "elapsed_time_per_iteration": 6.69133663, "memory(GiB)": 21.51, "elapsed_time": "4h 44m 29s", "remaining_time": "6h 52m 26s", "loss_scale": 1.0, "consumed_samples": 664832, "global_step/max_steps": "2597/6362"} +{"lm loss": 4.97991514, "grad_norm": 0.56851155, "learning_rate": 6.975e-05, "elapsed_time_per_iteration": 6.46160579, "memory(GiB)": 21.51, "elapsed_time": "4h 44m 35s", "remaining_time": "6h 52m 19s", "loss_scale": 1.0, "consumed_samples": 665088, "global_step/max_steps": "2598/6362"} +{"lm loss": 4.97677946, "grad_norm": 0.5823524, "learning_rate": 6.972e-05, "elapsed_time_per_iteration": 6.77370644, "memory(GiB)": 21.51, "elapsed_time": "4h 44m 42s", "remaining_time": "6h 52m 13s", "loss_scale": 1.0, "consumed_samples": 665344, "global_step/max_steps": "2599/6362"} +{"lm loss": 4.99478626, "grad_norm": 0.53422725, "learning_rate": 6.97e-05, "elapsed_time_per_iteration": 6.74243188, "memory(GiB)": 21.51, "elapsed_time": "4h 44m 49s", "remaining_time": "6h 52m 6s", "loss_scale": 1.0, "consumed_samples": 665600, "global_step/max_steps": "2600/6362"} +{"lm loss": 4.96074247, "grad_norm": 0.55217981, "learning_rate": 6.968e-05, "elapsed_time_per_iteration": 6.64852524, "memory(GiB)": 21.51, "elapsed_time": "4h 44m 55s", "remaining_time": "6h 52m 0s", "loss_scale": 1.0, "consumed_samples": 665856, "global_step/max_steps": "2601/6362"} +{"lm loss": 4.98203659, "grad_norm": 0.59952325, "learning_rate": 6.965e-05, "elapsed_time_per_iteration": 6.54613519, "memory(GiB)": 21.51, "elapsed_time": "4h 45m 2s", "remaining_time": "6h 51m 53s", "loss_scale": 1.0, "consumed_samples": 666112, "global_step/max_steps": "2602/6362"} +{"lm loss": 4.97730064, "grad_norm": 0.57413244, "learning_rate": 6.963e-05, "elapsed_time_per_iteration": 6.69709325, "memory(GiB)": 21.51, "elapsed_time": "4h 45m 9s", "remaining_time": "6h 51m 47s", "loss_scale": 1.0, "consumed_samples": 666368, "global_step/max_steps": "2603/6362"} +{"lm loss": 4.97675276, "grad_norm": 0.68379903, "learning_rate": 6.961e-05, "elapsed_time_per_iteration": 6.6792469, "memory(GiB)": 21.51, "elapsed_time": "4h 45m 15s", "remaining_time": "6h 51m 40s", "loss_scale": 1.0, "consumed_samples": 666624, "global_step/max_steps": "2604/6362"} +{"lm loss": 4.96611881, "grad_norm": 0.72811538, "learning_rate": 6.958e-05, "elapsed_time_per_iteration": 6.43208385, "memory(GiB)": 21.51, "elapsed_time": "4h 45m 22s", "remaining_time": "6h 51m 34s", "loss_scale": 1.0, "consumed_samples": 666880, "global_step/max_steps": "2605/6362"} +{"lm loss": 4.96414709, "grad_norm": 0.83197683, "learning_rate": 6.956e-05, "elapsed_time_per_iteration": 6.4413631, "memory(GiB)": 21.51, "elapsed_time": "4h 45m 28s", "remaining_time": "6h 51m 27s", "loss_scale": 1.0, "consumed_samples": 667136, "global_step/max_steps": "2606/6362"} +{"lm loss": 4.94950581, "grad_norm": 0.9502787, "learning_rate": 6.954e-05, "elapsed_time_per_iteration": 6.64529967, "memory(GiB)": 21.51, "elapsed_time": "4h 45m 35s", "remaining_time": "6h 51m 20s", "loss_scale": 1.0, "consumed_samples": 667392, "global_step/max_steps": "2607/6362"} +{"lm loss": 4.97619963, "grad_norm": 0.72838521, "learning_rate": 6.951e-05, "elapsed_time_per_iteration": 6.51740623, "memory(GiB)": 21.51, "elapsed_time": "4h 45m 41s", "remaining_time": "6h 51m 14s", "loss_scale": 1.0, "consumed_samples": 667648, "global_step/max_steps": "2608/6362"} +{"lm loss": 4.95943403, "grad_norm": 0.54009843, "learning_rate": 6.949e-05, "elapsed_time_per_iteration": 6.50069571, "memory(GiB)": 21.51, "elapsed_time": "4h 45m 48s", "remaining_time": "6h 51m 7s", "loss_scale": 1.0, "consumed_samples": 667904, "global_step/max_steps": "2609/6362"} +{"lm loss": 4.97139597, "grad_norm": 0.65076393, "learning_rate": 6.947e-05, "elapsed_time_per_iteration": 6.57451844, "memory(GiB)": 21.51, "elapsed_time": "4h 45m 54s", "remaining_time": "6h 51m 1s", "loss_scale": 1.0, "consumed_samples": 668160, "global_step/max_steps": "2610/6362"} +{"lm loss": 4.98278284, "grad_norm": 0.70944458, "learning_rate": 6.944e-05, "elapsed_time_per_iteration": 6.6016643, "memory(GiB)": 21.51, "elapsed_time": "4h 46m 1s", "remaining_time": "6h 50m 54s", "loss_scale": 1.0, "consumed_samples": 668416, "global_step/max_steps": "2611/6362"} +{"lm loss": 4.95544767, "grad_norm": 0.59662455, "learning_rate": 6.942e-05, "elapsed_time_per_iteration": 6.54279065, "memory(GiB)": 21.51, "elapsed_time": "4h 46m 8s", "remaining_time": "6h 50m 47s", "loss_scale": 1.0, "consumed_samples": 668672, "global_step/max_steps": "2612/6362"} +{"lm loss": 4.99172783, "grad_norm": 0.5278036, "learning_rate": 6.94e-05, "elapsed_time_per_iteration": 6.66079664, "memory(GiB)": 21.51, "elapsed_time": "4h 46m 14s", "remaining_time": "6h 50m 41s", "loss_scale": 1.0, "consumed_samples": 668928, "global_step/max_steps": "2613/6362"} +{"lm loss": 4.97771454, "grad_norm": 0.50752282, "learning_rate": 6.937e-05, "elapsed_time_per_iteration": 6.49498844, "memory(GiB)": 21.51, "elapsed_time": "4h 46m 21s", "remaining_time": "6h 50m 34s", "loss_scale": 1.0, "consumed_samples": 669184, "global_step/max_steps": "2614/6362"} +{"lm loss": 4.95986128, "grad_norm": 0.60688531, "learning_rate": 6.935e-05, "elapsed_time_per_iteration": 6.37867498, "memory(GiB)": 21.51, "elapsed_time": "4h 46m 27s", "remaining_time": "6h 50m 27s", "loss_scale": 1.0, "consumed_samples": 669440, "global_step/max_steps": "2615/6362"} +{"lm loss": 4.9860177, "grad_norm": 0.5519526, "learning_rate": 6.933e-05, "elapsed_time_per_iteration": 6.55592227, "memory(GiB)": 21.51, "elapsed_time": "4h 46m 34s", "remaining_time": "6h 50m 21s", "loss_scale": 1.0, "consumed_samples": 669696, "global_step/max_steps": "2616/6362"} +{"lm loss": 4.94987917, "grad_norm": 0.48819682, "learning_rate": 6.93e-05, "elapsed_time_per_iteration": 6.51177931, "memory(GiB)": 21.51, "elapsed_time": "4h 46m 40s", "remaining_time": "6h 50m 14s", "loss_scale": 1.0, "consumed_samples": 669952, "global_step/max_steps": "2617/6362"} +{"lm loss": 4.98035383, "grad_norm": 0.55404878, "learning_rate": 6.928e-05, "elapsed_time_per_iteration": 6.74120831, "memory(GiB)": 21.51, "elapsed_time": "4h 46m 47s", "remaining_time": "6h 50m 8s", "loss_scale": 1.0, "consumed_samples": 670208, "global_step/max_steps": "2618/6362"} +{"lm loss": 4.98308897, "grad_norm": 0.55456942, "learning_rate": 6.926e-05, "elapsed_time_per_iteration": 6.56989694, "memory(GiB)": 21.51, "elapsed_time": "4h 46m 54s", "remaining_time": "6h 50m 1s", "loss_scale": 1.0, "consumed_samples": 670464, "global_step/max_steps": "2619/6362"} +{"lm loss": 4.98097229, "grad_norm": 0.53602904, "learning_rate": 6.923e-05, "elapsed_time_per_iteration": 6.50525832, "memory(GiB)": 21.51, "elapsed_time": "4h 47m 0s", "remaining_time": "6h 49m 55s", "loss_scale": 1.0, "consumed_samples": 670720, "global_step/max_steps": "2620/6362"} +{"lm loss": 4.95525742, "grad_norm": 0.53862071, "learning_rate": 6.921e-05, "elapsed_time_per_iteration": 6.62542868, "memory(GiB)": 21.51, "elapsed_time": "4h 47m 7s", "remaining_time": "6h 49m 48s", "loss_scale": 1.0, "consumed_samples": 670976, "global_step/max_steps": "2621/6362"} +{"lm loss": 4.96716452, "grad_norm": 0.55652857, "learning_rate": 6.918e-05, "elapsed_time_per_iteration": 6.74357343, "memory(GiB)": 21.51, "elapsed_time": "4h 47m 13s", "remaining_time": "6h 49m 42s", "loss_scale": 1.0, "consumed_samples": 671232, "global_step/max_steps": "2622/6362"} +{"lm loss": 4.98805332, "grad_norm": 0.61655915, "learning_rate": 6.916e-05, "elapsed_time_per_iteration": 6.4569118, "memory(GiB)": 21.51, "elapsed_time": "4h 47m 20s", "remaining_time": "6h 49m 35s", "loss_scale": 1.0, "consumed_samples": 671488, "global_step/max_steps": "2623/6362"} +{"lm loss": 4.96557903, "grad_norm": 0.72008985, "learning_rate": 6.914e-05, "elapsed_time_per_iteration": 6.66283917, "memory(GiB)": 21.51, "elapsed_time": "4h 47m 27s", "remaining_time": "6h 49m 29s", "loss_scale": 1.0, "consumed_samples": 671744, "global_step/max_steps": "2624/6362"} +{"lm loss": 4.96276903, "grad_norm": 0.70557559, "learning_rate": 6.911e-05, "elapsed_time_per_iteration": 6.69631243, "memory(GiB)": 21.51, "elapsed_time": "4h 47m 33s", "remaining_time": "6h 49m 22s", "loss_scale": 1.0, "consumed_samples": 672000, "global_step/max_steps": "2625/6362"} +{"lm loss": 4.96103573, "grad_norm": 0.53052717, "learning_rate": 6.909e-05, "elapsed_time_per_iteration": 6.45665956, "memory(GiB)": 21.51, "elapsed_time": "4h 47m 40s", "remaining_time": "6h 49m 15s", "loss_scale": 1.0, "consumed_samples": 672256, "global_step/max_steps": "2626/6362"} +{"lm loss": 4.97186565, "grad_norm": 0.47947589, "learning_rate": 6.907e-05, "elapsed_time_per_iteration": 6.83711267, "memory(GiB)": 21.51, "elapsed_time": "4h 47m 47s", "remaining_time": "6h 49m 9s", "loss_scale": 1.0, "consumed_samples": 672512, "global_step/max_steps": "2627/6362"} +{"lm loss": 4.96100521, "grad_norm": 0.5528487, "learning_rate": 6.904e-05, "elapsed_time_per_iteration": 6.66985846, "memory(GiB)": 21.51, "elapsed_time": "4h 47m 53s", "remaining_time": "6h 49m 3s", "loss_scale": 1.0, "consumed_samples": 672768, "global_step/max_steps": "2628/6362"} +{"lm loss": 4.95719051, "grad_norm": 0.60060549, "learning_rate": 6.902e-05, "elapsed_time_per_iteration": 6.68197942, "memory(GiB)": 21.51, "elapsed_time": "4h 48m 0s", "remaining_time": "6h 48m 56s", "loss_scale": 1.0, "consumed_samples": 673024, "global_step/max_steps": "2629/6362"} +{"lm loss": 4.97731256, "grad_norm": 0.60231477, "learning_rate": 6.9e-05, "elapsed_time_per_iteration": 6.60934019, "memory(GiB)": 21.51, "elapsed_time": "4h 48m 6s", "remaining_time": "6h 48m 50s", "loss_scale": 1.0, "consumed_samples": 673280, "global_step/max_steps": "2630/6362"} +{"lm loss": 4.9672327, "grad_norm": 0.50361633, "learning_rate": 6.897e-05, "elapsed_time_per_iteration": 6.71067357, "memory(GiB)": 21.51, "elapsed_time": "4h 48m 13s", "remaining_time": "6h 48m 44s", "loss_scale": 1.0, "consumed_samples": 673536, "global_step/max_steps": "2631/6362"} +{"lm loss": 4.96522045, "grad_norm": 0.55397934, "learning_rate": 6.895e-05, "elapsed_time_per_iteration": 6.68957162, "memory(GiB)": 21.51, "elapsed_time": "4h 48m 20s", "remaining_time": "6h 48m 37s", "loss_scale": 1.0, "consumed_samples": 673792, "global_step/max_steps": "2632/6362"} +{"lm loss": 4.99273205, "grad_norm": 0.58087838, "learning_rate": 6.893e-05, "elapsed_time_per_iteration": 6.5849421, "memory(GiB)": 21.51, "elapsed_time": "4h 48m 26s", "remaining_time": "6h 48m 31s", "loss_scale": 1.0, "consumed_samples": 674048, "global_step/max_steps": "2633/6362"} +{"lm loss": 4.9805975, "grad_norm": 0.54421914, "learning_rate": 6.89e-05, "elapsed_time_per_iteration": 6.55615807, "memory(GiB)": 21.51, "elapsed_time": "4h 48m 33s", "remaining_time": "6h 48m 24s", "loss_scale": 1.0, "consumed_samples": 674304, "global_step/max_steps": "2634/6362"} +{"lm loss": 4.98306417, "grad_norm": 0.54737431, "learning_rate": 6.888e-05, "elapsed_time_per_iteration": 6.43263364, "memory(GiB)": 21.51, "elapsed_time": "4h 48m 39s", "remaining_time": "6h 48m 17s", "loss_scale": 1.0, "consumed_samples": 674560, "global_step/max_steps": "2635/6362"} +{"lm loss": 4.98295403, "grad_norm": 0.59124327, "learning_rate": 6.886e-05, "elapsed_time_per_iteration": 6.60946178, "memory(GiB)": 21.51, "elapsed_time": "4h 48m 46s", "remaining_time": "6h 48m 11s", "loss_scale": 1.0, "consumed_samples": 674816, "global_step/max_steps": "2636/6362"} +{"lm loss": 4.95874166, "grad_norm": 0.62918919, "learning_rate": 6.883e-05, "elapsed_time_per_iteration": 6.77650142, "memory(GiB)": 21.51, "elapsed_time": "4h 48m 53s", "remaining_time": "6h 48m 4s", "loss_scale": 1.0, "consumed_samples": 675072, "global_step/max_steps": "2637/6362"} +{"lm loss": 4.95917606, "grad_norm": 0.61125058, "learning_rate": 6.881e-05, "elapsed_time_per_iteration": 6.60119557, "memory(GiB)": 21.51, "elapsed_time": "4h 48m 59s", "remaining_time": "6h 47m 58s", "loss_scale": 1.0, "consumed_samples": 675328, "global_step/max_steps": "2638/6362"} +{"lm loss": 4.97127295, "grad_norm": 0.64162803, "learning_rate": 6.878e-05, "elapsed_time_per_iteration": 7.38962913, "memory(GiB)": 21.51, "elapsed_time": "4h 49m 7s", "remaining_time": "6h 47m 52s", "loss_scale": 1.0, "consumed_samples": 675584, "global_step/max_steps": "2639/6362"} +{"lm loss": 4.97816801, "grad_norm": 0.65032464, "learning_rate": 6.876e-05, "elapsed_time_per_iteration": 6.68483043, "memory(GiB)": 21.51, "elapsed_time": "4h 49m 14s", "remaining_time": "6h 47m 46s", "loss_scale": 1.0, "consumed_samples": 675840, "global_step/max_steps": "2640/6362"} +{"lm loss": 4.97763109, "grad_norm": 0.61932176, "learning_rate": 6.874e-05, "elapsed_time_per_iteration": 6.46574807, "memory(GiB)": 21.51, "elapsed_time": "4h 49m 20s", "remaining_time": "6h 47m 39s", "loss_scale": 1.0, "consumed_samples": 676096, "global_step/max_steps": "2641/6362"} +{"lm loss": 4.97823048, "grad_norm": 0.67310005, "learning_rate": 6.871e-05, "elapsed_time_per_iteration": 6.68834257, "memory(GiB)": 21.51, "elapsed_time": "4h 49m 27s", "remaining_time": "6h 47m 33s", "loss_scale": 1.0, "consumed_samples": 676352, "global_step/max_steps": "2642/6362"} +{"lm loss": 4.93481588, "grad_norm": 0.72042131, "learning_rate": 6.869e-05, "elapsed_time_per_iteration": 7.52450943, "memory(GiB)": 21.51, "elapsed_time": "4h 49m 34s", "remaining_time": "6h 47m 28s", "loss_scale": 1.0, "consumed_samples": 676608, "global_step/max_steps": "2643/6362"} +{"lm loss": 4.94918871, "grad_norm": 0.74861503, "learning_rate": 6.867e-05, "elapsed_time_per_iteration": 6.64458895, "memory(GiB)": 21.51, "elapsed_time": "4h 49m 41s", "remaining_time": "6h 47m 21s", "loss_scale": 1.0, "consumed_samples": 676864, "global_step/max_steps": "2644/6362"} +{"lm loss": 4.96180153, "grad_norm": 0.71000081, "learning_rate": 6.864e-05, "elapsed_time_per_iteration": 6.75614572, "memory(GiB)": 21.51, "elapsed_time": "4h 49m 48s", "remaining_time": "6h 47m 15s", "loss_scale": 1.0, "consumed_samples": 677120, "global_step/max_steps": "2645/6362"} +{"lm loss": 4.96478271, "grad_norm": 0.70703989, "learning_rate": 6.862e-05, "elapsed_time_per_iteration": 6.8930521, "memory(GiB)": 21.51, "elapsed_time": "4h 49m 54s", "remaining_time": "6h 47m 9s", "loss_scale": 1.0, "consumed_samples": 677376, "global_step/max_steps": "2646/6362"} +{"lm loss": 4.97823191, "grad_norm": 0.63076609, "learning_rate": 6.86e-05, "elapsed_time_per_iteration": 6.51339412, "memory(GiB)": 21.51, "elapsed_time": "4h 50m 1s", "remaining_time": "6h 47m 2s", "loss_scale": 1.0, "consumed_samples": 677632, "global_step/max_steps": "2647/6362"} +{"lm loss": 4.98073196, "grad_norm": 0.70369792, "learning_rate": 6.857e-05, "elapsed_time_per_iteration": 6.75354171, "memory(GiB)": 21.51, "elapsed_time": "4h 50m 8s", "remaining_time": "6h 46m 56s", "loss_scale": 1.0, "consumed_samples": 677888, "global_step/max_steps": "2648/6362"} +{"lm loss": 4.95689631, "grad_norm": 0.67531335, "learning_rate": 6.855e-05, "elapsed_time_per_iteration": 6.69381356, "memory(GiB)": 21.51, "elapsed_time": "4h 50m 14s", "remaining_time": "6h 46m 49s", "loss_scale": 1.0, "consumed_samples": 678144, "global_step/max_steps": "2649/6362"} +{"lm loss": 4.96050739, "grad_norm": 0.6006006, "learning_rate": 6.853e-05, "elapsed_time_per_iteration": 6.53393316, "memory(GiB)": 21.51, "elapsed_time": "4h 50m 21s", "remaining_time": "6h 46m 43s", "loss_scale": 1.0, "consumed_samples": 678400, "global_step/max_steps": "2650/6362"} +{"lm loss": 4.96876287, "grad_norm": 0.56305557, "learning_rate": 6.85e-05, "elapsed_time_per_iteration": 6.57212734, "memory(GiB)": 21.51, "elapsed_time": "4h 50m 28s", "remaining_time": "6h 46m 36s", "loss_scale": 1.0, "consumed_samples": 678656, "global_step/max_steps": "2651/6362"} +{"lm loss": 4.94498825, "grad_norm": 0.56572676, "learning_rate": 6.848e-05, "elapsed_time_per_iteration": 6.50500727, "memory(GiB)": 21.51, "elapsed_time": "4h 50m 34s", "remaining_time": "6h 46m 29s", "loss_scale": 1.0, "consumed_samples": 678912, "global_step/max_steps": "2652/6362"} +{"lm loss": 4.98469591, "grad_norm": 0.56428087, "learning_rate": 6.845e-05, "elapsed_time_per_iteration": 6.56996846, "memory(GiB)": 21.51, "elapsed_time": "4h 50m 41s", "remaining_time": "6h 46m 23s", "loss_scale": 1.0, "consumed_samples": 679168, "global_step/max_steps": "2653/6362"} +{"lm loss": 4.95844555, "grad_norm": 0.53692091, "learning_rate": 6.843e-05, "elapsed_time_per_iteration": 6.78952312, "memory(GiB)": 21.51, "elapsed_time": "4h 50m 47s", "remaining_time": "6h 46m 17s", "loss_scale": 1.0, "consumed_samples": 679424, "global_step/max_steps": "2654/6362"} +{"lm loss": 4.98139, "grad_norm": 0.56387854, "learning_rate": 6.841e-05, "elapsed_time_per_iteration": 6.73123908, "memory(GiB)": 21.51, "elapsed_time": "4h 50m 54s", "remaining_time": "6h 46m 10s", "loss_scale": 1.0, "consumed_samples": 679680, "global_step/max_steps": "2655/6362"} +{"lm loss": 4.99126768, "grad_norm": 0.65814316, "learning_rate": 6.838e-05, "elapsed_time_per_iteration": 6.65840244, "memory(GiB)": 21.51, "elapsed_time": "4h 51m 1s", "remaining_time": "6h 46m 4s", "loss_scale": 1.0, "consumed_samples": 679936, "global_step/max_steps": "2656/6362"} +{"lm loss": 4.94141102, "grad_norm": 0.80135602, "learning_rate": 6.836e-05, "elapsed_time_per_iteration": 6.7247436, "memory(GiB)": 21.51, "elapsed_time": "4h 51m 8s", "remaining_time": "6h 45m 57s", "loss_scale": 1.0, "consumed_samples": 680192, "global_step/max_steps": "2657/6362"} +{"lm loss": 4.95722818, "grad_norm": 0.80515391, "learning_rate": 6.834e-05, "elapsed_time_per_iteration": 6.55093455, "memory(GiB)": 21.51, "elapsed_time": "4h 51m 14s", "remaining_time": "6h 45m 51s", "loss_scale": 1.0, "consumed_samples": 680448, "global_step/max_steps": "2658/6362"} +{"lm loss": 4.95174646, "grad_norm": 0.64303517, "learning_rate": 6.831e-05, "elapsed_time_per_iteration": 6.68510628, "memory(GiB)": 21.51, "elapsed_time": "4h 51m 21s", "remaining_time": "6h 45m 44s", "loss_scale": 1.0, "consumed_samples": 680704, "global_step/max_steps": "2659/6362"} +{"lm loss": 4.97795916, "grad_norm": 0.52992409, "learning_rate": 6.829e-05, "elapsed_time_per_iteration": 6.62379146, "memory(GiB)": 21.51, "elapsed_time": "4h 51m 27s", "remaining_time": "6h 45m 38s", "loss_scale": 1.0, "consumed_samples": 680960, "global_step/max_steps": "2660/6362"} +{"lm loss": 4.97933722, "grad_norm": 0.65569925, "learning_rate": 6.827e-05, "elapsed_time_per_iteration": 6.53999734, "memory(GiB)": 21.51, "elapsed_time": "4h 51m 34s", "remaining_time": "6h 45m 31s", "loss_scale": 1.0, "consumed_samples": 681216, "global_step/max_steps": "2661/6362"} +{"lm loss": 4.97735834, "grad_norm": 0.6988225, "learning_rate": 6.824e-05, "elapsed_time_per_iteration": 6.63066268, "memory(GiB)": 21.51, "elapsed_time": "4h 51m 41s", "remaining_time": "6h 45m 25s", "loss_scale": 1.0, "consumed_samples": 681472, "global_step/max_steps": "2662/6362"} +{"lm loss": 4.97063303, "grad_norm": 0.53145748, "learning_rate": 6.822e-05, "elapsed_time_per_iteration": 6.57582068, "memory(GiB)": 21.51, "elapsed_time": "4h 51m 47s", "remaining_time": "6h 45m 18s", "loss_scale": 1.0, "consumed_samples": 681728, "global_step/max_steps": "2663/6362"} +{"lm loss": 4.98012972, "grad_norm": 0.49962202, "learning_rate": 6.819e-05, "elapsed_time_per_iteration": 6.66823697, "memory(GiB)": 21.51, "elapsed_time": "4h 51m 54s", "remaining_time": "6h 45m 12s", "loss_scale": 1.0, "consumed_samples": 681984, "global_step/max_steps": "2664/6362"} +{"lm loss": 4.97602892, "grad_norm": 0.52810967, "learning_rate": 6.817e-05, "elapsed_time_per_iteration": 6.56633258, "memory(GiB)": 21.51, "elapsed_time": "4h 52m 0s", "remaining_time": "6h 45m 5s", "loss_scale": 1.0, "consumed_samples": 682240, "global_step/max_steps": "2665/6362"} +{"lm loss": 4.98011208, "grad_norm": 0.5385552, "learning_rate": 6.815e-05, "elapsed_time_per_iteration": 6.60435319, "memory(GiB)": 21.51, "elapsed_time": "4h 52m 7s", "remaining_time": "6h 44m 59s", "loss_scale": 1.0, "consumed_samples": 682496, "global_step/max_steps": "2666/6362"} +{"lm loss": 4.96157646, "grad_norm": 0.62468076, "learning_rate": 6.812e-05, "elapsed_time_per_iteration": 6.56088233, "memory(GiB)": 21.51, "elapsed_time": "4h 52m 14s", "remaining_time": "6h 44m 52s", "loss_scale": 1.0, "consumed_samples": 682752, "global_step/max_steps": "2667/6362"} +{"lm loss": 4.97076178, "grad_norm": 0.65092099, "learning_rate": 6.81e-05, "elapsed_time_per_iteration": 6.70362067, "memory(GiB)": 21.51, "elapsed_time": "4h 52m 20s", "remaining_time": "6h 44m 46s", "loss_scale": 1.0, "consumed_samples": 683008, "global_step/max_steps": "2668/6362"} +{"lm loss": 4.97190952, "grad_norm": 0.55638933, "learning_rate": 6.808e-05, "elapsed_time_per_iteration": 6.55202055, "memory(GiB)": 21.51, "elapsed_time": "4h 52m 27s", "remaining_time": "6h 44m 39s", "loss_scale": 1.0, "consumed_samples": 683264, "global_step/max_steps": "2669/6362"} +{"lm loss": 4.95269012, "grad_norm": 0.51026553, "learning_rate": 6.805e-05, "elapsed_time_per_iteration": 6.71259284, "memory(GiB)": 21.51, "elapsed_time": "4h 52m 33s", "remaining_time": "6h 44m 33s", "loss_scale": 1.0, "consumed_samples": 683520, "global_step/max_steps": "2670/6362"} +{"lm loss": 4.97086668, "grad_norm": 0.57485479, "learning_rate": 6.803e-05, "elapsed_time_per_iteration": 6.69671202, "memory(GiB)": 21.51, "elapsed_time": "4h 52m 40s", "remaining_time": "6h 44m 26s", "loss_scale": 1.0, "consumed_samples": 683776, "global_step/max_steps": "2671/6362"} +{"lm loss": 4.97352886, "grad_norm": 0.60710162, "learning_rate": 6.801e-05, "elapsed_time_per_iteration": 6.52001953, "memory(GiB)": 21.51, "elapsed_time": "4h 52m 47s", "remaining_time": "6h 44m 20s", "loss_scale": 1.0, "consumed_samples": 684032, "global_step/max_steps": "2672/6362"} +{"lm loss": 4.94950104, "grad_norm": 0.54679358, "learning_rate": 6.798e-05, "elapsed_time_per_iteration": 6.86248899, "memory(GiB)": 21.51, "elapsed_time": "4h 52m 54s", "remaining_time": "6h 44m 13s", "loss_scale": 1.0, "consumed_samples": 684288, "global_step/max_steps": "2673/6362"} +{"lm loss": 4.96923447, "grad_norm": 0.53370386, "learning_rate": 6.796e-05, "elapsed_time_per_iteration": 6.65146327, "memory(GiB)": 21.51, "elapsed_time": "4h 53m 0s", "remaining_time": "6h 44m 7s", "loss_scale": 1.0, "consumed_samples": 684544, "global_step/max_steps": "2674/6362"} +{"lm loss": 4.95873642, "grad_norm": 0.58106518, "learning_rate": 6.793e-05, "elapsed_time_per_iteration": 6.67820382, "memory(GiB)": 21.51, "elapsed_time": "4h 53m 7s", "remaining_time": "6h 44m 1s", "loss_scale": 1.0, "consumed_samples": 684800, "global_step/max_steps": "2675/6362"} +{"lm loss": 4.94566822, "grad_norm": 0.63158351, "learning_rate": 6.791e-05, "elapsed_time_per_iteration": 6.62676477, "memory(GiB)": 21.51, "elapsed_time": "4h 53m 14s", "remaining_time": "6h 43m 54s", "loss_scale": 1.0, "consumed_samples": 685056, "global_step/max_steps": "2676/6362"} +{"lm loss": 4.95849037, "grad_norm": 0.5983749, "learning_rate": 6.789e-05, "elapsed_time_per_iteration": 6.75301981, "memory(GiB)": 21.51, "elapsed_time": "4h 53m 20s", "remaining_time": "6h 43m 48s", "loss_scale": 1.0, "consumed_samples": 685312, "global_step/max_steps": "2677/6362"} +{"lm loss": 4.96678257, "grad_norm": 0.52667236, "learning_rate": 6.786e-05, "elapsed_time_per_iteration": 6.53293133, "memory(GiB)": 21.51, "elapsed_time": "4h 53m 27s", "remaining_time": "6h 43m 41s", "loss_scale": 1.0, "consumed_samples": 685568, "global_step/max_steps": "2678/6362"} +{"lm loss": 4.95929813, "grad_norm": 0.65922123, "learning_rate": 6.784e-05, "elapsed_time_per_iteration": 6.50154471, "memory(GiB)": 21.51, "elapsed_time": "4h 53m 33s", "remaining_time": "6h 43m 34s", "loss_scale": 1.0, "consumed_samples": 685824, "global_step/max_steps": "2679/6362"} +{"lm loss": 4.94176865, "grad_norm": 0.7889601, "learning_rate": 6.782e-05, "elapsed_time_per_iteration": 6.81236529, "memory(GiB)": 21.51, "elapsed_time": "4h 53m 40s", "remaining_time": "6h 43m 28s", "loss_scale": 1.0, "consumed_samples": 686080, "global_step/max_steps": "2680/6362"} +{"lm loss": 4.96840906, "grad_norm": 0.7630859, "learning_rate": 6.779e-05, "elapsed_time_per_iteration": 6.71141195, "memory(GiB)": 21.51, "elapsed_time": "4h 53m 47s", "remaining_time": "6h 43m 22s", "loss_scale": 1.0, "consumed_samples": 686336, "global_step/max_steps": "2681/6362"} +{"lm loss": 4.94840336, "grad_norm": 0.63357151, "learning_rate": 6.777e-05, "elapsed_time_per_iteration": 6.77467299, "memory(GiB)": 21.51, "elapsed_time": "4h 53m 54s", "remaining_time": "6h 43m 15s", "loss_scale": 1.0, "consumed_samples": 686592, "global_step/max_steps": "2682/6362"} +{"lm loss": 4.97070265, "grad_norm": 0.61203492, "learning_rate": 6.774e-05, "elapsed_time_per_iteration": 6.68431711, "memory(GiB)": 21.51, "elapsed_time": "4h 54m 0s", "remaining_time": "6h 43m 9s", "loss_scale": 1.0, "consumed_samples": 686848, "global_step/max_steps": "2683/6362"} +{"lm loss": 4.93995905, "grad_norm": 0.66881043, "learning_rate": 6.772e-05, "elapsed_time_per_iteration": 6.60426879, "memory(GiB)": 21.51, "elapsed_time": "4h 54m 7s", "remaining_time": "6h 43m 2s", "loss_scale": 1.0, "consumed_samples": 687104, "global_step/max_steps": "2684/6362"} +{"lm loss": 4.96952009, "grad_norm": 0.62997693, "learning_rate": 6.77e-05, "elapsed_time_per_iteration": 6.49257588, "memory(GiB)": 21.51, "elapsed_time": "4h 54m 13s", "remaining_time": "6h 42m 56s", "loss_scale": 1.0, "consumed_samples": 687360, "global_step/max_steps": "2685/6362"} +{"lm loss": 4.93683863, "grad_norm": 0.5430904, "learning_rate": 6.767e-05, "elapsed_time_per_iteration": 6.50818658, "memory(GiB)": 21.51, "elapsed_time": "4h 54m 20s", "remaining_time": "6h 42m 49s", "loss_scale": 1.0, "consumed_samples": 687616, "global_step/max_steps": "2686/6362"} +{"lm loss": 4.98396635, "grad_norm": 0.61544305, "learning_rate": 6.765e-05, "elapsed_time_per_iteration": 6.54632449, "memory(GiB)": 21.51, "elapsed_time": "4h 54m 26s", "remaining_time": "6h 42m 43s", "loss_scale": 1.0, "consumed_samples": 687872, "global_step/max_steps": "2687/6362"} +{"lm loss": 4.96252489, "grad_norm": 0.58289909, "learning_rate": 6.763e-05, "elapsed_time_per_iteration": 6.47978234, "memory(GiB)": 21.51, "elapsed_time": "4h 54m 33s", "remaining_time": "6h 42m 36s", "loss_scale": 1.0, "consumed_samples": 688128, "global_step/max_steps": "2688/6362"} +{"lm loss": 4.94216442, "grad_norm": 0.55082572, "learning_rate": 6.76e-05, "elapsed_time_per_iteration": 6.81207681, "memory(GiB)": 21.51, "elapsed_time": "4h 54m 40s", "remaining_time": "6h 42m 30s", "loss_scale": 1.0, "consumed_samples": 688384, "global_step/max_steps": "2689/6362"} +{"lm loss": 4.97631598, "grad_norm": 0.61551464, "learning_rate": 6.758e-05, "elapsed_time_per_iteration": 6.38564157, "memory(GiB)": 21.51, "elapsed_time": "4h 54m 46s", "remaining_time": "6h 42m 23s", "loss_scale": 1.0, "consumed_samples": 688640, "global_step/max_steps": "2690/6362"} +{"lm loss": 4.97276354, "grad_norm": 0.62446946, "learning_rate": 6.755e-05, "elapsed_time_per_iteration": 6.54635143, "memory(GiB)": 21.51, "elapsed_time": "4h 54m 53s", "remaining_time": "6h 42m 16s", "loss_scale": 1.0, "consumed_samples": 688896, "global_step/max_steps": "2691/6362"} +{"lm loss": 4.94783592, "grad_norm": 0.62951803, "learning_rate": 6.753e-05, "elapsed_time_per_iteration": 6.52634501, "memory(GiB)": 21.51, "elapsed_time": "4h 54m 59s", "remaining_time": "6h 42m 9s", "loss_scale": 1.0, "consumed_samples": 689152, "global_step/max_steps": "2692/6362"} +{"lm loss": 5.00203609, "grad_norm": 0.63916701, "learning_rate": 6.751e-05, "elapsed_time_per_iteration": 6.54607081, "memory(GiB)": 21.51, "elapsed_time": "4h 55m 6s", "remaining_time": "6h 42m 3s", "loss_scale": 1.0, "consumed_samples": 689408, "global_step/max_steps": "2693/6362"} +{"lm loss": 4.97467232, "grad_norm": 0.51022512, "learning_rate": 6.748e-05, "elapsed_time_per_iteration": 6.64022183, "memory(GiB)": 21.51, "elapsed_time": "4h 55m 12s", "remaining_time": "6h 41m 56s", "loss_scale": 1.0, "consumed_samples": 689664, "global_step/max_steps": "2694/6362"} +{"lm loss": 4.96655989, "grad_norm": 0.50915164, "learning_rate": 6.746e-05, "elapsed_time_per_iteration": 6.63149953, "memory(GiB)": 21.51, "elapsed_time": "4h 55m 19s", "remaining_time": "6h 41m 50s", "loss_scale": 1.0, "consumed_samples": 689920, "global_step/max_steps": "2695/6362"} +{"lm loss": 4.99017668, "grad_norm": 0.5601725, "learning_rate": 6.743e-05, "elapsed_time_per_iteration": 6.56688738, "memory(GiB)": 21.51, "elapsed_time": "4h 55m 26s", "remaining_time": "6h 41m 43s", "loss_scale": 1.0, "consumed_samples": 690176, "global_step/max_steps": "2696/6362"} +{"lm loss": 4.98003769, "grad_norm": 0.56540173, "learning_rate": 6.741e-05, "elapsed_time_per_iteration": 6.46473026, "memory(GiB)": 21.51, "elapsed_time": "4h 55m 32s", "remaining_time": "6h 41m 37s", "loss_scale": 1.0, "consumed_samples": 690432, "global_step/max_steps": "2697/6362"} +{"lm loss": 4.97376442, "grad_norm": 0.51247847, "learning_rate": 6.739e-05, "elapsed_time_per_iteration": 6.3746376, "memory(GiB)": 21.51, "elapsed_time": "4h 55m 38s", "remaining_time": "6h 41m 30s", "loss_scale": 1.0, "consumed_samples": 690688, "global_step/max_steps": "2698/6362"} +{"lm loss": 4.96039629, "grad_norm": 0.60766047, "learning_rate": 6.736e-05, "elapsed_time_per_iteration": 6.48476362, "memory(GiB)": 21.51, "elapsed_time": "4h 55m 45s", "remaining_time": "6h 41m 23s", "loss_scale": 1.0, "consumed_samples": 690944, "global_step/max_steps": "2699/6362"} +{"lm loss": 4.96300602, "grad_norm": 0.59055036, "learning_rate": 6.734e-05, "elapsed_time_per_iteration": 6.52804351, "memory(GiB)": 21.51, "elapsed_time": "4h 55m 51s", "remaining_time": "6h 41m 16s", "loss_scale": 1.0, "consumed_samples": 691200, "global_step/max_steps": "2700/6362"} +{"lm loss": 4.95827675, "grad_norm": 0.64737856, "learning_rate": 6.732e-05, "elapsed_time_per_iteration": 6.53615999, "memory(GiB)": 21.51, "elapsed_time": "4h 55m 58s", "remaining_time": "6h 41m 10s", "loss_scale": 1.0, "consumed_samples": 691456, "global_step/max_steps": "2701/6362"} +{"lm loss": 4.92623663, "grad_norm": 0.73837203, "learning_rate": 6.729e-05, "elapsed_time_per_iteration": 6.46467328, "memory(GiB)": 21.51, "elapsed_time": "4h 56m 4s", "remaining_time": "6h 41m 3s", "loss_scale": 1.0, "consumed_samples": 691712, "global_step/max_steps": "2702/6362"} +{"lm loss": 4.94686079, "grad_norm": 0.70096117, "learning_rate": 6.727e-05, "elapsed_time_per_iteration": 6.76039648, "memory(GiB)": 21.51, "elapsed_time": "4h 56m 11s", "remaining_time": "6h 40m 57s", "loss_scale": 1.0, "consumed_samples": 691968, "global_step/max_steps": "2703/6362"} +{"lm loss": 4.935256, "grad_norm": 0.59615362, "learning_rate": 6.724e-05, "elapsed_time_per_iteration": 6.85102487, "memory(GiB)": 21.51, "elapsed_time": "4h 56m 18s", "remaining_time": "6h 40m 51s", "loss_scale": 1.0, "consumed_samples": 692224, "global_step/max_steps": "2704/6362"} +{"lm loss": 4.95096874, "grad_norm": 0.59704369, "learning_rate": 6.722e-05, "elapsed_time_per_iteration": 6.8552835, "memory(GiB)": 21.51, "elapsed_time": "4h 56m 25s", "remaining_time": "6h 40m 44s", "loss_scale": 1.0, "consumed_samples": 692480, "global_step/max_steps": "2705/6362"} +{"lm loss": 4.95721436, "grad_norm": 0.58832502, "learning_rate": 6.72e-05, "elapsed_time_per_iteration": 6.88026524, "memory(GiB)": 21.51, "elapsed_time": "4h 56m 32s", "remaining_time": "6h 40m 38s", "loss_scale": 1.0, "consumed_samples": 692736, "global_step/max_steps": "2706/6362"} +{"lm loss": 4.96975899, "grad_norm": 0.57226562, "learning_rate": 6.717e-05, "elapsed_time_per_iteration": 6.57152534, "memory(GiB)": 21.51, "elapsed_time": "4h 56m 38s", "remaining_time": "6h 40m 32s", "loss_scale": 1.0, "consumed_samples": 692992, "global_step/max_steps": "2707/6362"} +{"lm loss": 4.9411869, "grad_norm": 0.56124097, "learning_rate": 6.715e-05, "elapsed_time_per_iteration": 6.60667968, "memory(GiB)": 21.51, "elapsed_time": "4h 56m 45s", "remaining_time": "6h 40m 25s", "loss_scale": 1.0, "consumed_samples": 693248, "global_step/max_steps": "2708/6362"} +{"lm loss": 4.95634747, "grad_norm": 0.57584667, "learning_rate": 6.713e-05, "elapsed_time_per_iteration": 6.56210661, "memory(GiB)": 21.51, "elapsed_time": "4h 56m 52s", "remaining_time": "6h 40m 18s", "loss_scale": 1.0, "consumed_samples": 693504, "global_step/max_steps": "2709/6362"} +{"lm loss": 4.97011089, "grad_norm": 0.55252796, "learning_rate": 6.71e-05, "elapsed_time_per_iteration": 6.4280417, "memory(GiB)": 21.51, "elapsed_time": "4h 56m 58s", "remaining_time": "6h 40m 12s", "loss_scale": 1.0, "consumed_samples": 693760, "global_step/max_steps": "2710/6362"} +{"lm loss": 4.96313667, "grad_norm": 0.51538396, "learning_rate": 6.708e-05, "elapsed_time_per_iteration": 6.63362575, "memory(GiB)": 21.51, "elapsed_time": "4h 57m 5s", "remaining_time": "6h 40m 5s", "loss_scale": 1.0, "consumed_samples": 694016, "global_step/max_steps": "2711/6362"} +{"lm loss": 4.96194649, "grad_norm": 0.57659835, "learning_rate": 6.705e-05, "elapsed_time_per_iteration": 6.57807446, "memory(GiB)": 21.51, "elapsed_time": "4h 57m 11s", "remaining_time": "6h 39m 59s", "loss_scale": 1.0, "consumed_samples": 694272, "global_step/max_steps": "2712/6362"} +{"lm loss": 4.95356655, "grad_norm": 0.51699507, "learning_rate": 6.703e-05, "elapsed_time_per_iteration": 6.39676452, "memory(GiB)": 21.51, "elapsed_time": "4h 57m 18s", "remaining_time": "6h 39m 52s", "loss_scale": 1.0, "consumed_samples": 694528, "global_step/max_steps": "2713/6362"} +{"lm loss": 4.9720335, "grad_norm": 0.63031495, "learning_rate": 6.701e-05, "elapsed_time_per_iteration": 6.49079657, "memory(GiB)": 21.51, "elapsed_time": "4h 57m 24s", "remaining_time": "6h 39m 45s", "loss_scale": 1.0, "consumed_samples": 694784, "global_step/max_steps": "2714/6362"} +{"lm loss": 4.97101831, "grad_norm": 0.67271572, "learning_rate": 6.698e-05, "elapsed_time_per_iteration": 6.53338957, "memory(GiB)": 21.51, "elapsed_time": "4h 57m 31s", "remaining_time": "6h 39m 38s", "loss_scale": 1.0, "consumed_samples": 695040, "global_step/max_steps": "2715/6362"} +{"lm loss": 4.98996592, "grad_norm": 0.6017071, "learning_rate": 6.696e-05, "elapsed_time_per_iteration": 6.58176875, "memory(GiB)": 21.51, "elapsed_time": "4h 57m 37s", "remaining_time": "6h 39m 32s", "loss_scale": 1.0, "consumed_samples": 695296, "global_step/max_steps": "2716/6362"} +{"lm loss": 4.97233105, "grad_norm": 0.57032478, "learning_rate": 6.693e-05, "elapsed_time_per_iteration": 6.50043344, "memory(GiB)": 21.51, "elapsed_time": "4h 57m 44s", "remaining_time": "6h 39m 25s", "loss_scale": 1.0, "consumed_samples": 695552, "global_step/max_steps": "2717/6362"} +{"lm loss": 4.97343445, "grad_norm": 0.52560043, "learning_rate": 6.691e-05, "elapsed_time_per_iteration": 6.53855014, "memory(GiB)": 21.51, "elapsed_time": "4h 57m 50s", "remaining_time": "6h 39m 19s", "loss_scale": 1.0, "consumed_samples": 695808, "global_step/max_steps": "2718/6362"} +{"lm loss": 4.94597435, "grad_norm": 0.57655603, "learning_rate": 6.689e-05, "elapsed_time_per_iteration": 6.43404818, "memory(GiB)": 21.51, "elapsed_time": "4h 57m 57s", "remaining_time": "6h 39m 12s", "loss_scale": 1.0, "consumed_samples": 696064, "global_step/max_steps": "2719/6362"} +{"lm loss": 4.96618986, "grad_norm": 0.60065573, "learning_rate": 6.686e-05, "elapsed_time_per_iteration": 6.6713407, "memory(GiB)": 21.51, "elapsed_time": "4h 58m 3s", "remaining_time": "6h 39m 5s", "loss_scale": 1.0, "consumed_samples": 696320, "global_step/max_steps": "2720/6362"} +{"lm loss": 4.94306993, "grad_norm": 0.5293659, "learning_rate": 6.684e-05, "elapsed_time_per_iteration": 6.53832412, "memory(GiB)": 21.51, "elapsed_time": "4h 58m 10s", "remaining_time": "6h 38m 59s", "loss_scale": 1.0, "consumed_samples": 696576, "global_step/max_steps": "2721/6362"} +{"lm loss": 4.95332098, "grad_norm": 0.62346917, "learning_rate": 6.681e-05, "elapsed_time_per_iteration": 6.357723, "memory(GiB)": 21.51, "elapsed_time": "4h 58m 16s", "remaining_time": "6h 38m 52s", "loss_scale": 1.0, "consumed_samples": 696832, "global_step/max_steps": "2722/6362"} +{"lm loss": 4.95857, "grad_norm": 0.65941614, "learning_rate": 6.679e-05, "elapsed_time_per_iteration": 6.51305103, "memory(GiB)": 21.51, "elapsed_time": "4h 58m 23s", "remaining_time": "6h 38m 45s", "loss_scale": 1.0, "consumed_samples": 697088, "global_step/max_steps": "2723/6362"} +{"lm loss": 4.95574951, "grad_norm": 0.64956188, "learning_rate": 6.677e-05, "elapsed_time_per_iteration": 6.53768015, "memory(GiB)": 21.51, "elapsed_time": "4h 58m 29s", "remaining_time": "6h 38m 39s", "loss_scale": 1.0, "consumed_samples": 697344, "global_step/max_steps": "2724/6362"} +{"lm loss": 4.94896364, "grad_norm": 0.6667732, "learning_rate": 6.674e-05, "elapsed_time_per_iteration": 6.65590215, "memory(GiB)": 21.51, "elapsed_time": "4h 58m 36s", "remaining_time": "6h 38m 32s", "loss_scale": 1.0, "consumed_samples": 697600, "global_step/max_steps": "2725/6362"} +{"lm loss": 4.9642458, "grad_norm": 0.64034158, "learning_rate": 6.672e-05, "elapsed_time_per_iteration": 6.69181347, "memory(GiB)": 21.51, "elapsed_time": "4h 58m 43s", "remaining_time": "6h 38m 26s", "loss_scale": 1.0, "consumed_samples": 697856, "global_step/max_steps": "2726/6362"} +{"lm loss": 4.96854162, "grad_norm": 0.61988378, "learning_rate": 6.669e-05, "elapsed_time_per_iteration": 6.75378108, "memory(GiB)": 21.51, "elapsed_time": "4h 58m 49s", "remaining_time": "6h 38m 19s", "loss_scale": 1.0, "consumed_samples": 698112, "global_step/max_steps": "2727/6362"} +{"lm loss": 4.9739728, "grad_norm": 0.51229525, "learning_rate": 6.667e-05, "elapsed_time_per_iteration": 6.53548694, "memory(GiB)": 21.51, "elapsed_time": "4h 58m 56s", "remaining_time": "6h 38m 13s", "loss_scale": 1.0, "consumed_samples": 698368, "global_step/max_steps": "2728/6362"} +{"lm loss": 4.96883106, "grad_norm": 0.51949632, "learning_rate": 6.665e-05, "elapsed_time_per_iteration": 6.67497873, "memory(GiB)": 21.51, "elapsed_time": "4h 59m 3s", "remaining_time": "6h 38m 6s", "loss_scale": 1.0, "consumed_samples": 698624, "global_step/max_steps": "2729/6362"} +{"lm loss": 4.95930433, "grad_norm": 0.56322861, "learning_rate": 6.662e-05, "elapsed_time_per_iteration": 6.84526753, "memory(GiB)": 21.51, "elapsed_time": "4h 59m 9s", "remaining_time": "6h 38m 0s", "loss_scale": 1.0, "consumed_samples": 698880, "global_step/max_steps": "2730/6362"} +{"lm loss": 4.95681047, "grad_norm": 0.61422211, "learning_rate": 6.66e-05, "elapsed_time_per_iteration": 6.41147876, "memory(GiB)": 21.51, "elapsed_time": "4h 59m 16s", "remaining_time": "6h 37m 53s", "loss_scale": 1.0, "consumed_samples": 699136, "global_step/max_steps": "2731/6362"} +{"lm loss": 4.98262835, "grad_norm": 0.55888844, "learning_rate": 6.658e-05, "elapsed_time_per_iteration": 6.56038356, "memory(GiB)": 21.51, "elapsed_time": "4h 59m 22s", "remaining_time": "6h 37m 47s", "loss_scale": 1.0, "consumed_samples": 699392, "global_step/max_steps": "2732/6362"} +{"lm loss": 4.95939493, "grad_norm": 0.4871701, "learning_rate": 6.655e-05, "elapsed_time_per_iteration": 6.61495352, "memory(GiB)": 21.51, "elapsed_time": "4h 59m 29s", "remaining_time": "6h 37m 40s", "loss_scale": 1.0, "consumed_samples": 699648, "global_step/max_steps": "2733/6362"} +{"lm loss": 4.97360754, "grad_norm": 0.56306487, "learning_rate": 6.653e-05, "elapsed_time_per_iteration": 6.55907464, "memory(GiB)": 21.51, "elapsed_time": "4h 59m 36s", "remaining_time": "6h 37m 34s", "loss_scale": 1.0, "consumed_samples": 699904, "global_step/max_steps": "2734/6362"} +{"lm loss": 4.94139194, "grad_norm": 0.67936671, "learning_rate": 6.65e-05, "elapsed_time_per_iteration": 6.69964981, "memory(GiB)": 21.51, "elapsed_time": "4h 59m 42s", "remaining_time": "6h 37m 27s", "loss_scale": 1.0, "consumed_samples": 700160, "global_step/max_steps": "2735/6362"} +{"lm loss": 4.98301935, "grad_norm": 0.73260403, "learning_rate": 6.648e-05, "elapsed_time_per_iteration": 6.65108943, "memory(GiB)": 21.51, "elapsed_time": "4h 59m 49s", "remaining_time": "6h 37m 21s", "loss_scale": 1.0, "consumed_samples": 700416, "global_step/max_steps": "2736/6362"} +{"lm loss": 4.94182158, "grad_norm": 0.77548093, "learning_rate": 6.646e-05, "elapsed_time_per_iteration": 6.55474806, "memory(GiB)": 21.51, "elapsed_time": "4h 59m 55s", "remaining_time": "6h 37m 14s", "loss_scale": 1.0, "consumed_samples": 700672, "global_step/max_steps": "2737/6362"} +{"lm loss": 4.97628784, "grad_norm": 0.73006397, "learning_rate": 6.643e-05, "elapsed_time_per_iteration": 6.65901256, "memory(GiB)": 21.51, "elapsed_time": "5h 0m 2s", "remaining_time": "6h 37m 8s", "loss_scale": 1.0, "consumed_samples": 700928, "global_step/max_steps": "2738/6362"} +{"lm loss": 4.9508543, "grad_norm": 0.57116646, "learning_rate": 6.641e-05, "elapsed_time_per_iteration": 6.51697278, "memory(GiB)": 21.51, "elapsed_time": "5h 0m 9s", "remaining_time": "6h 37m 1s", "loss_scale": 1.0, "consumed_samples": 701184, "global_step/max_steps": "2739/6362"} +{"lm loss": 4.97799635, "grad_norm": 0.57032001, "learning_rate": 6.638e-05, "elapsed_time_per_iteration": 6.78956676, "memory(GiB)": 21.51, "elapsed_time": "5h 0m 15s", "remaining_time": "6h 36m 55s", "loss_scale": 1.0, "consumed_samples": 701440, "global_step/max_steps": "2740/6362"} +{"lm loss": 4.96808958, "grad_norm": 0.54988939, "learning_rate": 6.636e-05, "elapsed_time_per_iteration": 6.76315165, "memory(GiB)": 21.51, "elapsed_time": "5h 0m 22s", "remaining_time": "6h 36m 48s", "loss_scale": 1.0, "consumed_samples": 701696, "global_step/max_steps": "2741/6362"} +{"lm loss": 4.96030664, "grad_norm": 0.55976748, "learning_rate": 6.634e-05, "elapsed_time_per_iteration": 6.69271731, "memory(GiB)": 21.51, "elapsed_time": "5h 0m 29s", "remaining_time": "6h 36m 42s", "loss_scale": 1.0, "consumed_samples": 701952, "global_step/max_steps": "2742/6362"} +{"lm loss": 4.94360876, "grad_norm": 0.60229379, "learning_rate": 6.631e-05, "elapsed_time_per_iteration": 6.81376839, "memory(GiB)": 21.51, "elapsed_time": "5h 0m 36s", "remaining_time": "6h 36m 36s", "loss_scale": 1.0, "consumed_samples": 702208, "global_step/max_steps": "2743/6362"} +{"lm loss": 4.95830297, "grad_norm": 0.57322162, "learning_rate": 6.629e-05, "elapsed_time_per_iteration": 6.36385989, "memory(GiB)": 21.51, "elapsed_time": "5h 0m 42s", "remaining_time": "6h 36m 29s", "loss_scale": 1.0, "consumed_samples": 702464, "global_step/max_steps": "2744/6362"} +{"lm loss": 4.9727602, "grad_norm": 0.51876652, "learning_rate": 6.626e-05, "elapsed_time_per_iteration": 6.52684832, "memory(GiB)": 21.51, "elapsed_time": "5h 0m 49s", "remaining_time": "6h 36m 22s", "loss_scale": 1.0, "consumed_samples": 702720, "global_step/max_steps": "2745/6362"} +{"lm loss": 4.95355749, "grad_norm": 0.55712032, "learning_rate": 6.624e-05, "elapsed_time_per_iteration": 6.44353342, "memory(GiB)": 21.51, "elapsed_time": "5h 0m 55s", "remaining_time": "6h 36m 15s", "loss_scale": 1.0, "consumed_samples": 702976, "global_step/max_steps": "2746/6362"} +{"lm loss": 4.94811583, "grad_norm": 0.53926092, "learning_rate": 6.622e-05, "elapsed_time_per_iteration": 6.73137116, "memory(GiB)": 21.51, "elapsed_time": "5h 1m 2s", "remaining_time": "6h 36m 9s", "loss_scale": 1.0, "consumed_samples": 703232, "global_step/max_steps": "2747/6362"} +{"lm loss": 4.95720434, "grad_norm": 0.59186715, "learning_rate": 6.619e-05, "elapsed_time_per_iteration": 6.4306922, "memory(GiB)": 21.51, "elapsed_time": "5h 1m 8s", "remaining_time": "6h 36m 2s", "loss_scale": 1.0, "consumed_samples": 703488, "global_step/max_steps": "2748/6362"} +{"lm loss": 4.94644499, "grad_norm": 0.68590617, "learning_rate": 6.617e-05, "elapsed_time_per_iteration": 6.61145759, "memory(GiB)": 21.51, "elapsed_time": "5h 1m 15s", "remaining_time": "6h 35m 56s", "loss_scale": 1.0, "consumed_samples": 703744, "global_step/max_steps": "2749/6362"} +{"lm loss": 4.96795321, "grad_norm": 0.6861043, "learning_rate": 6.614e-05, "elapsed_time_per_iteration": 6.87359333, "memory(GiB)": 21.51, "elapsed_time": "5h 1m 22s", "remaining_time": "6h 35m 50s", "loss_scale": 1.0, "consumed_samples": 704000, "global_step/max_steps": "2750/6362"} +{"lm loss": 4.97350883, "grad_norm": 0.5974977, "learning_rate": 6.612e-05, "elapsed_time_per_iteration": 6.52181411, "memory(GiB)": 21.51, "elapsed_time": "5h 1m 28s", "remaining_time": "6h 35m 43s", "loss_scale": 1.0, "consumed_samples": 704256, "global_step/max_steps": "2751/6362"} +{"lm loss": 4.99991608, "grad_norm": 0.52507973, "learning_rate": 6.61e-05, "elapsed_time_per_iteration": 6.4691484, "memory(GiB)": 21.51, "elapsed_time": "5h 1m 35s", "remaining_time": "6h 35m 36s", "loss_scale": 1.0, "consumed_samples": 704512, "global_step/max_steps": "2752/6362"} +{"lm loss": 4.96999311, "grad_norm": 0.5640462, "learning_rate": 6.607e-05, "elapsed_time_per_iteration": 6.83377957, "memory(GiB)": 21.51, "elapsed_time": "5h 1m 42s", "remaining_time": "6h 35m 30s", "loss_scale": 1.0, "consumed_samples": 704768, "global_step/max_steps": "2753/6362"} +{"lm loss": 4.94603682, "grad_norm": 0.57203978, "learning_rate": 6.605e-05, "elapsed_time_per_iteration": 6.67642593, "memory(GiB)": 21.51, "elapsed_time": "5h 1m 48s", "remaining_time": "6h 35m 24s", "loss_scale": 1.0, "consumed_samples": 705024, "global_step/max_steps": "2754/6362"} +{"lm loss": 4.93575478, "grad_norm": 0.60087985, "learning_rate": 6.602e-05, "elapsed_time_per_iteration": 6.42829561, "memory(GiB)": 21.51, "elapsed_time": "5h 1m 55s", "remaining_time": "6h 35m 17s", "loss_scale": 1.0, "consumed_samples": 705280, "global_step/max_steps": "2755/6362"} +{"lm loss": 4.97033739, "grad_norm": 0.6370272, "learning_rate": 6.6e-05, "elapsed_time_per_iteration": 6.79707527, "memory(GiB)": 21.51, "elapsed_time": "5h 2m 1s", "remaining_time": "6h 35m 11s", "loss_scale": 1.0, "consumed_samples": 705536, "global_step/max_steps": "2756/6362"} +{"lm loss": 4.96242189, "grad_norm": 0.57392389, "learning_rate": 6.597e-05, "elapsed_time_per_iteration": 6.4826479, "memory(GiB)": 21.51, "elapsed_time": "5h 2m 8s", "remaining_time": "6h 35m 4s", "loss_scale": 1.0, "consumed_samples": 705792, "global_step/max_steps": "2757/6362"} +{"lm loss": 4.97671652, "grad_norm": 0.59531337, "learning_rate": 6.595e-05, "elapsed_time_per_iteration": 6.42592311, "memory(GiB)": 21.51, "elapsed_time": "5h 2m 14s", "remaining_time": "6h 34m 57s", "loss_scale": 1.0, "consumed_samples": 706048, "global_step/max_steps": "2758/6362"} +{"lm loss": 4.95644283, "grad_norm": 0.52585787, "learning_rate": 6.593e-05, "elapsed_time_per_iteration": 6.54599428, "memory(GiB)": 21.51, "elapsed_time": "5h 2m 21s", "remaining_time": "6h 34m 50s", "loss_scale": 1.0, "consumed_samples": 706304, "global_step/max_steps": "2759/6362"} +{"lm loss": 4.95117426, "grad_norm": 0.54381186, "learning_rate": 6.59e-05, "elapsed_time_per_iteration": 6.5948956, "memory(GiB)": 21.51, "elapsed_time": "5h 2m 27s", "remaining_time": "6h 34m 44s", "loss_scale": 1.0, "consumed_samples": 706560, "global_step/max_steps": "2760/6362"} +{"lm loss": 4.96991301, "grad_norm": 0.61609352, "learning_rate": 6.588e-05, "elapsed_time_per_iteration": 6.55440354, "memory(GiB)": 21.51, "elapsed_time": "5h 2m 34s", "remaining_time": "6h 34m 37s", "loss_scale": 1.0, "consumed_samples": 706816, "global_step/max_steps": "2761/6362"} +{"lm loss": 4.96284676, "grad_norm": 0.59254318, "learning_rate": 6.585e-05, "elapsed_time_per_iteration": 6.50632954, "memory(GiB)": 21.51, "elapsed_time": "5h 2m 41s", "remaining_time": "6h 34m 31s", "loss_scale": 1.0, "consumed_samples": 707072, "global_step/max_steps": "2762/6362"} +{"lm loss": 4.96434879, "grad_norm": 0.5389297, "learning_rate": 6.583e-05, "elapsed_time_per_iteration": 6.65857053, "memory(GiB)": 21.51, "elapsed_time": "5h 2m 47s", "remaining_time": "6h 34m 24s", "loss_scale": 1.0, "consumed_samples": 707328, "global_step/max_steps": "2763/6362"} +{"lm loss": 4.96849728, "grad_norm": 0.6036523, "learning_rate": 6.581e-05, "elapsed_time_per_iteration": 6.71067905, "memory(GiB)": 21.51, "elapsed_time": "5h 2m 54s", "remaining_time": "6h 34m 18s", "loss_scale": 1.0, "consumed_samples": 707584, "global_step/max_steps": "2764/6362"} +{"lm loss": 4.95506239, "grad_norm": 0.60069108, "learning_rate": 6.578e-05, "elapsed_time_per_iteration": 6.48660684, "memory(GiB)": 21.51, "elapsed_time": "5h 3m 0s", "remaining_time": "6h 34m 11s", "loss_scale": 1.0, "consumed_samples": 707840, "global_step/max_steps": "2765/6362"} +{"lm loss": 4.94679594, "grad_norm": 0.63936013, "learning_rate": 6.576e-05, "elapsed_time_per_iteration": 6.53302622, "memory(GiB)": 21.51, "elapsed_time": "5h 3m 7s", "remaining_time": "6h 34m 4s", "loss_scale": 1.0, "consumed_samples": 708096, "global_step/max_steps": "2766/6362"} +{"lm loss": 4.97222328, "grad_norm": 0.70666414, "learning_rate": 6.573e-05, "elapsed_time_per_iteration": 6.49956036, "memory(GiB)": 21.51, "elapsed_time": "5h 3m 13s", "remaining_time": "6h 33m 58s", "loss_scale": 1.0, "consumed_samples": 708352, "global_step/max_steps": "2767/6362"} +{"lm loss": 4.933599, "grad_norm": 0.62258726, "learning_rate": 6.571e-05, "elapsed_time_per_iteration": 6.5033958, "memory(GiB)": 21.51, "elapsed_time": "5h 3m 20s", "remaining_time": "6h 33m 51s", "loss_scale": 1.0, "consumed_samples": 708608, "global_step/max_steps": "2768/6362"} +{"lm loss": 4.98255873, "grad_norm": 0.5736782, "learning_rate": 6.569e-05, "elapsed_time_per_iteration": 6.61309433, "memory(GiB)": 21.51, "elapsed_time": "5h 3m 27s", "remaining_time": "6h 33m 45s", "loss_scale": 1.0, "consumed_samples": 708864, "global_step/max_steps": "2769/6362"} +{"lm loss": 4.96529531, "grad_norm": 0.67218226, "learning_rate": 6.566e-05, "elapsed_time_per_iteration": 6.75882244, "memory(GiB)": 21.51, "elapsed_time": "5h 3m 33s", "remaining_time": "6h 33m 38s", "loss_scale": 1.0, "consumed_samples": 709120, "global_step/max_steps": "2770/6362"} +{"lm loss": 4.94707775, "grad_norm": 0.745857, "learning_rate": 6.564e-05, "elapsed_time_per_iteration": 6.63456535, "memory(GiB)": 21.51, "elapsed_time": "5h 3m 40s", "remaining_time": "6h 33m 32s", "loss_scale": 1.0, "consumed_samples": 709376, "global_step/max_steps": "2771/6362"} +{"lm loss": 4.93474293, "grad_norm": 0.63458604, "learning_rate": 6.561e-05, "elapsed_time_per_iteration": 6.67910051, "memory(GiB)": 21.51, "elapsed_time": "5h 3m 47s", "remaining_time": "6h 33m 25s", "loss_scale": 1.0, "consumed_samples": 709632, "global_step/max_steps": "2772/6362"} +{"lm loss": 4.94847918, "grad_norm": 0.60391128, "learning_rate": 6.559e-05, "elapsed_time_per_iteration": 6.70609641, "memory(GiB)": 21.51, "elapsed_time": "5h 3m 53s", "remaining_time": "6h 33m 19s", "loss_scale": 1.0, "consumed_samples": 709888, "global_step/max_steps": "2773/6362"} +{"lm loss": 4.92935324, "grad_norm": 0.57491159, "learning_rate": 6.557e-05, "elapsed_time_per_iteration": 6.46403122, "memory(GiB)": 21.51, "elapsed_time": "5h 4m 0s", "remaining_time": "6h 33m 12s", "loss_scale": 1.0, "consumed_samples": 710144, "global_step/max_steps": "2774/6362"} +{"lm loss": 4.94528627, "grad_norm": 0.50700277, "learning_rate": 6.554e-05, "elapsed_time_per_iteration": 6.54267073, "memory(GiB)": 21.51, "elapsed_time": "5h 4m 6s", "remaining_time": "6h 33m 6s", "loss_scale": 1.0, "consumed_samples": 710400, "global_step/max_steps": "2775/6362"} +{"lm loss": 4.97704554, "grad_norm": 0.59225732, "learning_rate": 6.552e-05, "elapsed_time_per_iteration": 6.68371987, "memory(GiB)": 21.51, "elapsed_time": "5h 4m 13s", "remaining_time": "6h 32m 59s", "loss_scale": 1.0, "consumed_samples": 710656, "global_step/max_steps": "2776/6362"} +{"lm loss": 4.95395231, "grad_norm": 0.59769613, "learning_rate": 6.549e-05, "elapsed_time_per_iteration": 6.55528593, "memory(GiB)": 21.51, "elapsed_time": "5h 4m 20s", "remaining_time": "6h 32m 53s", "loss_scale": 1.0, "consumed_samples": 710912, "global_step/max_steps": "2777/6362"} +{"lm loss": 4.95376873, "grad_norm": 0.5645225, "learning_rate": 6.547e-05, "elapsed_time_per_iteration": 6.54870725, "memory(GiB)": 21.51, "elapsed_time": "5h 4m 26s", "remaining_time": "6h 32m 46s", "loss_scale": 1.0, "consumed_samples": 711168, "global_step/max_steps": "2778/6362"} +{"lm loss": 4.95537996, "grad_norm": 0.63895887, "learning_rate": 6.544e-05, "elapsed_time_per_iteration": 6.79418325, "memory(GiB)": 21.51, "elapsed_time": "5h 4m 33s", "remaining_time": "6h 32m 40s", "loss_scale": 1.0, "consumed_samples": 711424, "global_step/max_steps": "2779/6362"} +{"lm loss": 4.97335243, "grad_norm": 0.53159434, "learning_rate": 6.542e-05, "elapsed_time_per_iteration": 6.62515759, "memory(GiB)": 21.51, "elapsed_time": "5h 4m 40s", "remaining_time": "6h 32m 33s", "loss_scale": 1.0, "consumed_samples": 711680, "global_step/max_steps": "2780/6362"} +{"lm loss": 4.97775364, "grad_norm": 0.57353318, "learning_rate": 6.54e-05, "elapsed_time_per_iteration": 6.61720967, "memory(GiB)": 21.51, "elapsed_time": "5h 4m 46s", "remaining_time": "6h 32m 27s", "loss_scale": 1.0, "consumed_samples": 711936, "global_step/max_steps": "2781/6362"} +{"lm loss": 4.94978666, "grad_norm": 0.62263167, "learning_rate": 6.537e-05, "elapsed_time_per_iteration": 6.37842298, "memory(GiB)": 21.51, "elapsed_time": "5h 4m 53s", "remaining_time": "6h 32m 20s", "loss_scale": 1.0, "consumed_samples": 712192, "global_step/max_steps": "2782/6362"} +{"lm loss": 4.9896965, "grad_norm": 0.63623011, "learning_rate": 6.535e-05, "elapsed_time_per_iteration": 6.56315494, "memory(GiB)": 21.51, "elapsed_time": "5h 4m 59s", "remaining_time": "6h 32m 13s", "loss_scale": 1.0, "consumed_samples": 712448, "global_step/max_steps": "2783/6362"} +{"lm loss": 4.95016909, "grad_norm": 0.58421373, "learning_rate": 6.532e-05, "elapsed_time_per_iteration": 6.81528997, "memory(GiB)": 21.51, "elapsed_time": "5h 5m 6s", "remaining_time": "6h 32m 7s", "loss_scale": 1.0, "consumed_samples": 712704, "global_step/max_steps": "2784/6362"} +{"lm loss": 4.95066023, "grad_norm": 0.53559774, "learning_rate": 6.53e-05, "elapsed_time_per_iteration": 6.47383189, "memory(GiB)": 21.51, "elapsed_time": "5h 5m 12s", "remaining_time": "6h 32m 0s", "loss_scale": 1.0, "consumed_samples": 712960, "global_step/max_steps": "2785/6362"} +{"lm loss": 4.96456003, "grad_norm": 0.56368697, "learning_rate": 6.528e-05, "elapsed_time_per_iteration": 6.55487585, "memory(GiB)": 21.51, "elapsed_time": "5h 5m 19s", "remaining_time": "6h 31m 54s", "loss_scale": 1.0, "consumed_samples": 713216, "global_step/max_steps": "2786/6362"} +{"lm loss": 4.98614883, "grad_norm": 0.5226512, "learning_rate": 6.525e-05, "elapsed_time_per_iteration": 6.66663623, "memory(GiB)": 21.51, "elapsed_time": "5h 5m 26s", "remaining_time": "6h 31m 47s", "loss_scale": 1.0, "consumed_samples": 713472, "global_step/max_steps": "2787/6362"} +{"lm loss": 4.95589638, "grad_norm": 0.57582831, "learning_rate": 6.523e-05, "elapsed_time_per_iteration": 6.45791006, "memory(GiB)": 21.51, "elapsed_time": "5h 5m 32s", "remaining_time": "6h 31m 40s", "loss_scale": 1.0, "consumed_samples": 713728, "global_step/max_steps": "2788/6362"} +{"lm loss": 4.94876671, "grad_norm": 0.60254222, "learning_rate": 6.52e-05, "elapsed_time_per_iteration": 6.40273261, "memory(GiB)": 21.51, "elapsed_time": "5h 5m 38s", "remaining_time": "6h 31m 34s", "loss_scale": 1.0, "consumed_samples": 713984, "global_step/max_steps": "2789/6362"} +{"lm loss": 4.95341635, "grad_norm": 0.62690705, "learning_rate": 6.518e-05, "elapsed_time_per_iteration": 6.56997299, "memory(GiB)": 21.51, "elapsed_time": "5h 5m 45s", "remaining_time": "6h 31m 27s", "loss_scale": 1.0, "consumed_samples": 714240, "global_step/max_steps": "2790/6362"} +{"lm loss": 4.97555017, "grad_norm": 0.64719015, "learning_rate": 6.515e-05, "elapsed_time_per_iteration": 6.69631481, "memory(GiB)": 21.51, "elapsed_time": "5h 5m 52s", "remaining_time": "6h 31m 21s", "loss_scale": 1.0, "consumed_samples": 714496, "global_step/max_steps": "2791/6362"} +{"lm loss": 4.93828249, "grad_norm": 0.5963105, "learning_rate": 6.513e-05, "elapsed_time_per_iteration": 6.4674046, "memory(GiB)": 21.51, "elapsed_time": "5h 5m 58s", "remaining_time": "6h 31m 14s", "loss_scale": 1.0, "consumed_samples": 714752, "global_step/max_steps": "2792/6362"} +{"lm loss": 4.97116852, "grad_norm": 0.64127117, "learning_rate": 6.511e-05, "elapsed_time_per_iteration": 6.93975472, "memory(GiB)": 21.51, "elapsed_time": "5h 6m 5s", "remaining_time": "6h 31m 8s", "loss_scale": 1.0, "consumed_samples": 715008, "global_step/max_steps": "2793/6362"} +{"lm loss": 4.9593277, "grad_norm": 0.6469751, "learning_rate": 6.508e-05, "elapsed_time_per_iteration": 6.42481017, "memory(GiB)": 21.51, "elapsed_time": "5h 6m 12s", "remaining_time": "6h 31m 1s", "loss_scale": 1.0, "consumed_samples": 715264, "global_step/max_steps": "2794/6362"} +{"lm loss": 4.94905519, "grad_norm": 0.65266055, "learning_rate": 6.506e-05, "elapsed_time_per_iteration": 6.45848703, "memory(GiB)": 21.51, "elapsed_time": "5h 6m 18s", "remaining_time": "6h 30m 54s", "loss_scale": 1.0, "consumed_samples": 715520, "global_step/max_steps": "2795/6362"} +{"lm loss": 4.95163822, "grad_norm": 0.59221405, "learning_rate": 6.503e-05, "elapsed_time_per_iteration": 6.72657084, "memory(GiB)": 21.51, "elapsed_time": "5h 6m 25s", "remaining_time": "6h 30m 48s", "loss_scale": 1.0, "consumed_samples": 715776, "global_step/max_steps": "2796/6362"} +{"lm loss": 4.95068693, "grad_norm": 0.52281231, "learning_rate": 6.501e-05, "elapsed_time_per_iteration": 6.4782517, "memory(GiB)": 21.51, "elapsed_time": "5h 6m 31s", "remaining_time": "6h 30m 41s", "loss_scale": 1.0, "consumed_samples": 716032, "global_step/max_steps": "2797/6362"} +{"lm loss": 4.94396114, "grad_norm": 0.48044205, "learning_rate": 6.499e-05, "elapsed_time_per_iteration": 6.48529911, "memory(GiB)": 21.51, "elapsed_time": "5h 6m 38s", "remaining_time": "6h 30m 35s", "loss_scale": 1.0, "consumed_samples": 716288, "global_step/max_steps": "2798/6362"} +{"lm loss": 4.9606266, "grad_norm": 0.56260532, "learning_rate": 6.496e-05, "elapsed_time_per_iteration": 6.63334775, "memory(GiB)": 21.51, "elapsed_time": "5h 6m 44s", "remaining_time": "6h 30m 28s", "loss_scale": 1.0, "consumed_samples": 716544, "global_step/max_steps": "2799/6362"} +{"lm loss": 4.96473074, "grad_norm": 0.56996089, "learning_rate": 6.494e-05, "elapsed_time_per_iteration": 6.61943269, "memory(GiB)": 21.51, "elapsed_time": "5h 6m 51s", "remaining_time": "6h 30m 21s", "loss_scale": 1.0, "consumed_samples": 716800, "global_step/max_steps": "2800/6362"} +{"lm loss": 4.96448851, "grad_norm": 0.54599619, "learning_rate": 6.491e-05, "elapsed_time_per_iteration": 6.51207709, "memory(GiB)": 21.51, "elapsed_time": "5h 6m 57s", "remaining_time": "6h 30m 15s", "loss_scale": 1.0, "consumed_samples": 717056, "global_step/max_steps": "2801/6362"} +{"lm loss": 4.95985746, "grad_norm": 0.5842492, "learning_rate": 6.489e-05, "elapsed_time_per_iteration": 6.55137658, "memory(GiB)": 21.51, "elapsed_time": "5h 7m 4s", "remaining_time": "6h 30m 8s", "loss_scale": 1.0, "consumed_samples": 717312, "global_step/max_steps": "2802/6362"} +{"lm loss": 4.99152279, "grad_norm": 0.55540466, "learning_rate": 6.486e-05, "elapsed_time_per_iteration": 6.44272804, "memory(GiB)": 21.51, "elapsed_time": "5h 7m 10s", "remaining_time": "6h 30m 1s", "loss_scale": 1.0, "consumed_samples": 717568, "global_step/max_steps": "2803/6362"} +{"lm loss": 4.92719078, "grad_norm": 0.52318478, "learning_rate": 6.484e-05, "elapsed_time_per_iteration": 6.50836706, "memory(GiB)": 21.51, "elapsed_time": "5h 7m 17s", "remaining_time": "6h 29m 55s", "loss_scale": 1.0, "consumed_samples": 717824, "global_step/max_steps": "2804/6362"} +{"lm loss": 4.94219637, "grad_norm": 0.56337065, "learning_rate": 6.482e-05, "elapsed_time_per_iteration": 6.52701426, "memory(GiB)": 21.51, "elapsed_time": "5h 7m 23s", "remaining_time": "6h 29m 48s", "loss_scale": 1.0, "consumed_samples": 718080, "global_step/max_steps": "2805/6362"} +{"lm loss": 4.94890165, "grad_norm": 0.59488797, "learning_rate": 6.479e-05, "elapsed_time_per_iteration": 6.66131783, "memory(GiB)": 21.51, "elapsed_time": "5h 7m 30s", "remaining_time": "6h 29m 42s", "loss_scale": 1.0, "consumed_samples": 718336, "global_step/max_steps": "2806/6362"} +{"lm loss": 4.97325325, "grad_norm": 0.4762378, "learning_rate": 6.477e-05, "elapsed_time_per_iteration": 6.51002908, "memory(GiB)": 21.51, "elapsed_time": "5h 7m 37s", "remaining_time": "6h 29m 35s", "loss_scale": 1.0, "consumed_samples": 718592, "global_step/max_steps": "2807/6362"} +{"lm loss": 4.94314718, "grad_norm": 0.51073837, "learning_rate": 6.474e-05, "elapsed_time_per_iteration": 6.566751, "memory(GiB)": 21.51, "elapsed_time": "5h 7m 43s", "remaining_time": "6h 29m 28s", "loss_scale": 1.0, "consumed_samples": 718848, "global_step/max_steps": "2808/6362"} +{"lm loss": 4.95969629, "grad_norm": 0.52471733, "learning_rate": 6.472e-05, "elapsed_time_per_iteration": 6.51189446, "memory(GiB)": 21.51, "elapsed_time": "5h 7m 50s", "remaining_time": "6h 29m 22s", "loss_scale": 1.0, "consumed_samples": 719104, "global_step/max_steps": "2809/6362"} +{"lm loss": 4.98241997, "grad_norm": 0.52652198, "learning_rate": 6.469e-05, "elapsed_time_per_iteration": 6.50846648, "memory(GiB)": 21.51, "elapsed_time": "5h 7m 56s", "remaining_time": "6h 29m 15s", "loss_scale": 1.0, "consumed_samples": 719360, "global_step/max_steps": "2810/6362"} +{"lm loss": 4.9623785, "grad_norm": 0.58707225, "learning_rate": 6.467e-05, "elapsed_time_per_iteration": 6.53948426, "memory(GiB)": 21.51, "elapsed_time": "5h 8m 3s", "remaining_time": "6h 29m 9s", "loss_scale": 1.0, "consumed_samples": 719616, "global_step/max_steps": "2811/6362"} +{"lm loss": 4.92941284, "grad_norm": 0.54664648, "learning_rate": 6.465e-05, "elapsed_time_per_iteration": 6.78712821, "memory(GiB)": 21.51, "elapsed_time": "5h 8m 10s", "remaining_time": "6h 29m 2s", "loss_scale": 1.0, "consumed_samples": 719872, "global_step/max_steps": "2812/6362"} +{"lm loss": 4.94436646, "grad_norm": 0.47763062, "learning_rate": 6.462e-05, "elapsed_time_per_iteration": 6.7201879, "memory(GiB)": 21.51, "elapsed_time": "5h 8m 16s", "remaining_time": "6h 28m 56s", "loss_scale": 1.0, "consumed_samples": 720128, "global_step/max_steps": "2813/6362"} +{"lm loss": 4.97200394, "grad_norm": 0.52452445, "learning_rate": 6.46e-05, "elapsed_time_per_iteration": 6.59393811, "memory(GiB)": 21.51, "elapsed_time": "5h 8m 23s", "remaining_time": "6h 28m 49s", "loss_scale": 1.0, "consumed_samples": 720384, "global_step/max_steps": "2814/6362"} +{"lm loss": 4.97961283, "grad_norm": 0.54875863, "learning_rate": 6.457e-05, "elapsed_time_per_iteration": 6.48500085, "memory(GiB)": 21.51, "elapsed_time": "5h 8m 29s", "remaining_time": "6h 28m 43s", "loss_scale": 1.0, "consumed_samples": 720640, "global_step/max_steps": "2815/6362"} +{"lm loss": 4.95944023, "grad_norm": 0.56675476, "learning_rate": 6.455e-05, "elapsed_time_per_iteration": 6.27151823, "memory(GiB)": 21.51, "elapsed_time": "5h 8m 36s", "remaining_time": "6h 28m 36s", "loss_scale": 1.0, "consumed_samples": 720896, "global_step/max_steps": "2816/6362"} +{"lm loss": 4.95301485, "grad_norm": 0.50037909, "learning_rate": 6.452e-05, "elapsed_time_per_iteration": 6.69896865, "memory(GiB)": 21.51, "elapsed_time": "5h 8m 42s", "remaining_time": "6h 28m 29s", "loss_scale": 1.0, "consumed_samples": 721152, "global_step/max_steps": "2817/6362"} +{"lm loss": 4.96387959, "grad_norm": 0.52616251, "learning_rate": 6.45e-05, "elapsed_time_per_iteration": 6.47430348, "memory(GiB)": 21.51, "elapsed_time": "5h 8m 49s", "remaining_time": "6h 28m 23s", "loss_scale": 1.0, "consumed_samples": 721408, "global_step/max_steps": "2818/6362"} +{"lm loss": 4.96260786, "grad_norm": 0.61900914, "learning_rate": 6.448e-05, "elapsed_time_per_iteration": 6.4269278, "memory(GiB)": 21.51, "elapsed_time": "5h 8m 55s", "remaining_time": "6h 28m 16s", "loss_scale": 1.0, "consumed_samples": 721664, "global_step/max_steps": "2819/6362"} +{"lm loss": 4.95023727, "grad_norm": 0.59459692, "learning_rate": 6.445e-05, "elapsed_time_per_iteration": 6.36158514, "memory(GiB)": 21.51, "elapsed_time": "5h 9m 2s", "remaining_time": "6h 28m 9s", "loss_scale": 1.0, "consumed_samples": 721920, "global_step/max_steps": "2820/6362"} +{"lm loss": 4.93913078, "grad_norm": 0.60598087, "learning_rate": 6.443e-05, "elapsed_time_per_iteration": 6.72702861, "memory(GiB)": 21.51, "elapsed_time": "5h 9m 8s", "remaining_time": "6h 28m 3s", "loss_scale": 1.0, "consumed_samples": 722176, "global_step/max_steps": "2821/6362"} +{"lm loss": 4.97344542, "grad_norm": 0.65869731, "learning_rate": 6.44e-05, "elapsed_time_per_iteration": 6.67675853, "memory(GiB)": 21.51, "elapsed_time": "5h 9m 15s", "remaining_time": "6h 27m 56s", "loss_scale": 1.0, "consumed_samples": 722432, "global_step/max_steps": "2822/6362"} +{"lm loss": 4.95797873, "grad_norm": 0.58528417, "learning_rate": 6.438e-05, "elapsed_time_per_iteration": 6.39076734, "memory(GiB)": 21.51, "elapsed_time": "5h 9m 21s", "remaining_time": "6h 27m 49s", "loss_scale": 1.0, "consumed_samples": 722688, "global_step/max_steps": "2823/6362"} +{"lm loss": 4.93396139, "grad_norm": 0.57755369, "learning_rate": 6.435e-05, "elapsed_time_per_iteration": 6.46205592, "memory(GiB)": 21.51, "elapsed_time": "5h 9m 28s", "remaining_time": "6h 27m 43s", "loss_scale": 1.0, "consumed_samples": 722944, "global_step/max_steps": "2824/6362"} +{"lm loss": 4.93197536, "grad_norm": 0.55949128, "learning_rate": 6.433e-05, "elapsed_time_per_iteration": 6.57803917, "memory(GiB)": 21.51, "elapsed_time": "5h 9m 34s", "remaining_time": "6h 27m 36s", "loss_scale": 1.0, "consumed_samples": 723200, "global_step/max_steps": "2825/6362"} +{"lm loss": 4.96436214, "grad_norm": 0.54291117, "learning_rate": 6.431e-05, "elapsed_time_per_iteration": 6.63072562, "memory(GiB)": 21.51, "elapsed_time": "5h 9m 41s", "remaining_time": "6h 27m 29s", "loss_scale": 1.0, "consumed_samples": 723456, "global_step/max_steps": "2826/6362"} +{"lm loss": 4.94888973, "grad_norm": 0.56072223, "learning_rate": 6.428e-05, "elapsed_time_per_iteration": 6.55476952, "memory(GiB)": 21.51, "elapsed_time": "5h 9m 48s", "remaining_time": "6h 27m 23s", "loss_scale": 1.0, "consumed_samples": 723712, "global_step/max_steps": "2827/6362"} +{"lm loss": 4.96563578, "grad_norm": 0.55235612, "learning_rate": 6.426e-05, "elapsed_time_per_iteration": 6.6558671, "memory(GiB)": 21.51, "elapsed_time": "5h 9m 54s", "remaining_time": "6h 27m 16s", "loss_scale": 1.0, "consumed_samples": 723968, "global_step/max_steps": "2828/6362"} +{"lm loss": 4.94185305, "grad_norm": 0.63260543, "learning_rate": 6.423e-05, "elapsed_time_per_iteration": 6.7279985, "memory(GiB)": 21.51, "elapsed_time": "5h 10m 1s", "remaining_time": "6h 27m 10s", "loss_scale": 1.0, "consumed_samples": 724224, "global_step/max_steps": "2829/6362"} +{"lm loss": 4.95297527, "grad_norm": 0.60598046, "learning_rate": 6.421e-05, "elapsed_time_per_iteration": 6.33507228, "memory(GiB)": 21.51, "elapsed_time": "5h 10m 7s", "remaining_time": "6h 27m 3s", "loss_scale": 1.0, "consumed_samples": 724480, "global_step/max_steps": "2830/6362"} +{"lm loss": 4.96732092, "grad_norm": 0.57665211, "learning_rate": 6.418e-05, "elapsed_time_per_iteration": 6.64696956, "memory(GiB)": 21.51, "elapsed_time": "5h 10m 14s", "remaining_time": "6h 26m 57s", "loss_scale": 1.0, "consumed_samples": 724736, "global_step/max_steps": "2831/6362"} +{"lm loss": 4.95878172, "grad_norm": 0.59664094, "learning_rate": 6.416e-05, "elapsed_time_per_iteration": 6.64986396, "memory(GiB)": 21.51, "elapsed_time": "5h 10m 21s", "remaining_time": "6h 26m 50s", "loss_scale": 1.0, "consumed_samples": 724992, "global_step/max_steps": "2832/6362"} +{"lm loss": 4.94072151, "grad_norm": 0.5543679, "learning_rate": 6.414e-05, "elapsed_time_per_iteration": 6.59691167, "memory(GiB)": 21.51, "elapsed_time": "5h 10m 27s", "remaining_time": "6h 26m 44s", "loss_scale": 1.0, "consumed_samples": 725248, "global_step/max_steps": "2833/6362"} +{"lm loss": 4.94771671, "grad_norm": 0.65679711, "learning_rate": 6.411e-05, "elapsed_time_per_iteration": 6.75763655, "memory(GiB)": 21.51, "elapsed_time": "5h 10m 34s", "remaining_time": "6h 26m 37s", "loss_scale": 1.0, "consumed_samples": 725504, "global_step/max_steps": "2834/6362"} +{"lm loss": 4.93375874, "grad_norm": 0.6471718, "learning_rate": 6.409e-05, "elapsed_time_per_iteration": 6.63341999, "memory(GiB)": 21.51, "elapsed_time": "5h 10m 41s", "remaining_time": "6h 26m 31s", "loss_scale": 1.0, "consumed_samples": 725760, "global_step/max_steps": "2835/6362"} +{"lm loss": 4.95994711, "grad_norm": 0.6305514, "learning_rate": 6.406e-05, "elapsed_time_per_iteration": 6.52579784, "memory(GiB)": 21.51, "elapsed_time": "5h 10m 47s", "remaining_time": "6h 26m 24s", "loss_scale": 1.0, "consumed_samples": 726016, "global_step/max_steps": "2836/6362"} +{"lm loss": 4.95359373, "grad_norm": 0.67267537, "learning_rate": 6.404e-05, "elapsed_time_per_iteration": 6.70766234, "memory(GiB)": 21.51, "elapsed_time": "5h 10m 54s", "remaining_time": "6h 26m 18s", "loss_scale": 1.0, "consumed_samples": 726272, "global_step/max_steps": "2837/6362"} +{"lm loss": 4.9404459, "grad_norm": 0.65984499, "learning_rate": 6.401e-05, "elapsed_time_per_iteration": 6.56371427, "memory(GiB)": 21.51, "elapsed_time": "5h 11m 0s", "remaining_time": "6h 26m 11s", "loss_scale": 1.0, "consumed_samples": 726528, "global_step/max_steps": "2838/6362"} +{"lm loss": 4.95705175, "grad_norm": 0.55637854, "learning_rate": 6.399e-05, "elapsed_time_per_iteration": 6.54407096, "memory(GiB)": 21.51, "elapsed_time": "5h 11m 7s", "remaining_time": "6h 26m 5s", "loss_scale": 1.0, "consumed_samples": 726784, "global_step/max_steps": "2839/6362"} +{"lm loss": 4.96187639, "grad_norm": 0.56256557, "learning_rate": 6.396e-05, "elapsed_time_per_iteration": 6.6117425, "memory(GiB)": 21.51, "elapsed_time": "5h 11m 14s", "remaining_time": "6h 25m 58s", "loss_scale": 1.0, "consumed_samples": 727040, "global_step/max_steps": "2840/6362"} +{"lm loss": 4.94798422, "grad_norm": 0.51281101, "learning_rate": 6.394e-05, "elapsed_time_per_iteration": 6.72659111, "memory(GiB)": 21.51, "elapsed_time": "5h 11m 20s", "remaining_time": "6h 25m 52s", "loss_scale": 1.0, "consumed_samples": 727296, "global_step/max_steps": "2841/6362"} +{"lm loss": 4.9637332, "grad_norm": 0.54927582, "learning_rate": 6.392e-05, "elapsed_time_per_iteration": 6.46196795, "memory(GiB)": 21.51, "elapsed_time": "5h 11m 27s", "remaining_time": "6h 25m 45s", "loss_scale": 1.0, "consumed_samples": 727552, "global_step/max_steps": "2842/6362"} +{"lm loss": 4.95085144, "grad_norm": 0.56616205, "learning_rate": 6.389e-05, "elapsed_time_per_iteration": 6.64111972, "memory(GiB)": 21.51, "elapsed_time": "5h 11m 33s", "remaining_time": "6h 25m 38s", "loss_scale": 1.0, "consumed_samples": 727808, "global_step/max_steps": "2843/6362"} +{"lm loss": 4.95313311, "grad_norm": 0.56082666, "learning_rate": 6.387e-05, "elapsed_time_per_iteration": 6.54635739, "memory(GiB)": 21.51, "elapsed_time": "5h 11m 40s", "remaining_time": "6h 25m 32s", "loss_scale": 1.0, "consumed_samples": 728064, "global_step/max_steps": "2844/6362"} +{"lm loss": 4.98384905, "grad_norm": 0.57752609, "learning_rate": 6.384e-05, "elapsed_time_per_iteration": 6.66433334, "memory(GiB)": 21.51, "elapsed_time": "5h 11m 47s", "remaining_time": "6h 25m 25s", "loss_scale": 1.0, "consumed_samples": 728320, "global_step/max_steps": "2845/6362"} +{"lm loss": 4.94920874, "grad_norm": 0.58209527, "learning_rate": 6.382e-05, "elapsed_time_per_iteration": 6.67285633, "memory(GiB)": 21.51, "elapsed_time": "5h 11m 53s", "remaining_time": "6h 25m 19s", "loss_scale": 1.0, "consumed_samples": 728576, "global_step/max_steps": "2846/6362"} +{"lm loss": 4.95810556, "grad_norm": 0.51620984, "learning_rate": 6.379e-05, "elapsed_time_per_iteration": 6.51789904, "memory(GiB)": 21.51, "elapsed_time": "5h 12m 0s", "remaining_time": "6h 25m 12s", "loss_scale": 1.0, "consumed_samples": 728832, "global_step/max_steps": "2847/6362"} +{"lm loss": 4.94889355, "grad_norm": 0.56578261, "learning_rate": 6.377e-05, "elapsed_time_per_iteration": 6.41138148, "memory(GiB)": 21.51, "elapsed_time": "5h 12m 6s", "remaining_time": "6h 25m 5s", "loss_scale": 1.0, "consumed_samples": 729088, "global_step/max_steps": "2848/6362"} +{"lm loss": 4.94881058, "grad_norm": 0.53751713, "learning_rate": 6.375e-05, "elapsed_time_per_iteration": 6.54376388, "memory(GiB)": 21.51, "elapsed_time": "5h 12m 13s", "remaining_time": "6h 24m 59s", "loss_scale": 1.0, "consumed_samples": 729344, "global_step/max_steps": "2849/6362"} +{"lm loss": 4.96433592, "grad_norm": 0.49726778, "learning_rate": 6.372e-05, "elapsed_time_per_iteration": 6.49293041, "memory(GiB)": 21.51, "elapsed_time": "5h 12m 19s", "remaining_time": "6h 24m 52s", "loss_scale": 1.0, "consumed_samples": 729600, "global_step/max_steps": "2850/6362"} +{"lm loss": 4.93874073, "grad_norm": 0.49091035, "learning_rate": 6.37e-05, "elapsed_time_per_iteration": 6.52604771, "memory(GiB)": 21.51, "elapsed_time": "5h 12m 26s", "remaining_time": "6h 24m 46s", "loss_scale": 1.0, "consumed_samples": 729856, "global_step/max_steps": "2851/6362"} +{"lm loss": 4.95797825, "grad_norm": 0.45144692, "learning_rate": 6.367e-05, "elapsed_time_per_iteration": 6.51861382, "memory(GiB)": 21.51, "elapsed_time": "5h 12m 32s", "remaining_time": "6h 24m 39s", "loss_scale": 1.0, "consumed_samples": 730112, "global_step/max_steps": "2852/6362"} +{"lm loss": 4.93715382, "grad_norm": 0.51632446, "learning_rate": 6.365e-05, "elapsed_time_per_iteration": 6.76922202, "memory(GiB)": 21.51, "elapsed_time": "5h 12m 39s", "remaining_time": "6h 24m 33s", "loss_scale": 1.0, "consumed_samples": 730368, "global_step/max_steps": "2853/6362"} +{"lm loss": 4.96104765, "grad_norm": 0.56101024, "learning_rate": 6.362e-05, "elapsed_time_per_iteration": 6.95025134, "memory(GiB)": 21.51, "elapsed_time": "5h 12m 46s", "remaining_time": "6h 24m 26s", "loss_scale": 1.0, "consumed_samples": 730624, "global_step/max_steps": "2854/6362"} +{"lm loss": 4.94826508, "grad_norm": 0.51884049, "learning_rate": 6.36e-05, "elapsed_time_per_iteration": 6.61680913, "memory(GiB)": 21.51, "elapsed_time": "5h 12m 53s", "remaining_time": "6h 24m 20s", "loss_scale": 1.0, "consumed_samples": 730880, "global_step/max_steps": "2855/6362"} +{"lm loss": 4.97464371, "grad_norm": 0.53132015, "learning_rate": 6.357e-05, "elapsed_time_per_iteration": 6.46198106, "memory(GiB)": 21.51, "elapsed_time": "5h 12m 59s", "remaining_time": "6h 24m 13s", "loss_scale": 1.0, "consumed_samples": 731136, "global_step/max_steps": "2856/6362"} +{"lm loss": 4.94904661, "grad_norm": 0.55401856, "learning_rate": 6.355e-05, "elapsed_time_per_iteration": 6.57522893, "memory(GiB)": 21.51, "elapsed_time": "5h 13m 6s", "remaining_time": "6h 24m 7s", "loss_scale": 1.0, "consumed_samples": 731392, "global_step/max_steps": "2857/6362"} +{"lm loss": 4.96593618, "grad_norm": 0.50800425, "learning_rate": 6.353e-05, "elapsed_time_per_iteration": 6.54901695, "memory(GiB)": 21.51, "elapsed_time": "5h 13m 12s", "remaining_time": "6h 24m 0s", "loss_scale": 1.0, "consumed_samples": 731648, "global_step/max_steps": "2858/6362"} +{"lm loss": 4.94119692, "grad_norm": 0.57147253, "learning_rate": 6.35e-05, "elapsed_time_per_iteration": 6.63704681, "memory(GiB)": 21.51, "elapsed_time": "5h 13m 19s", "remaining_time": "6h 23m 53s", "loss_scale": 1.0, "consumed_samples": 731904, "global_step/max_steps": "2859/6362"} +{"lm loss": 4.94976568, "grad_norm": 0.59241188, "learning_rate": 6.348e-05, "elapsed_time_per_iteration": 6.4918952, "memory(GiB)": 21.51, "elapsed_time": "5h 13m 25s", "remaining_time": "6h 23m 47s", "loss_scale": 1.0, "consumed_samples": 732160, "global_step/max_steps": "2860/6362"} +{"lm loss": 4.955441, "grad_norm": 0.64445025, "learning_rate": 6.345e-05, "elapsed_time_per_iteration": 6.5147953, "memory(GiB)": 21.51, "elapsed_time": "5h 13m 32s", "remaining_time": "6h 23m 40s", "loss_scale": 1.0, "consumed_samples": 732416, "global_step/max_steps": "2861/6362"} +{"lm loss": 4.93363762, "grad_norm": 0.79729337, "learning_rate": 6.343e-05, "elapsed_time_per_iteration": 6.51100755, "memory(GiB)": 21.51, "elapsed_time": "5h 13m 38s", "remaining_time": "6h 23m 34s", "loss_scale": 1.0, "consumed_samples": 732672, "global_step/max_steps": "2862/6362"} +{"lm loss": 4.96753645, "grad_norm": 0.76118964, "learning_rate": 6.34e-05, "elapsed_time_per_iteration": 6.50235724, "memory(GiB)": 21.51, "elapsed_time": "5h 13m 45s", "remaining_time": "6h 23m 27s", "loss_scale": 1.0, "consumed_samples": 732928, "global_step/max_steps": "2863/6362"} +{"lm loss": 4.94112253, "grad_norm": 0.6678645, "learning_rate": 6.338e-05, "elapsed_time_per_iteration": 6.66929054, "memory(GiB)": 21.51, "elapsed_time": "5h 13m 52s", "remaining_time": "6h 23m 20s", "loss_scale": 1.0, "consumed_samples": 733184, "global_step/max_steps": "2864/6362"} +{"lm loss": 4.98114061, "grad_norm": 0.57569021, "learning_rate": 6.335e-05, "elapsed_time_per_iteration": 6.50495172, "memory(GiB)": 21.51, "elapsed_time": "5h 13m 58s", "remaining_time": "6h 23m 14s", "loss_scale": 1.0, "consumed_samples": 733440, "global_step/max_steps": "2865/6362"} +{"lm loss": 4.95084047, "grad_norm": 0.60817671, "learning_rate": 6.333e-05, "elapsed_time_per_iteration": 6.58476591, "memory(GiB)": 21.51, "elapsed_time": "5h 14m 5s", "remaining_time": "6h 23m 7s", "loss_scale": 1.0, "consumed_samples": 733696, "global_step/max_steps": "2866/6362"} +{"lm loss": 4.96836662, "grad_norm": 0.61255616, "learning_rate": 6.331e-05, "elapsed_time_per_iteration": 6.76749706, "memory(GiB)": 21.51, "elapsed_time": "5h 14m 11s", "remaining_time": "6h 23m 1s", "loss_scale": 1.0, "consumed_samples": 733952, "global_step/max_steps": "2867/6362"} +{"lm loss": 4.93367672, "grad_norm": 0.53632182, "learning_rate": 6.328e-05, "elapsed_time_per_iteration": 6.4871943, "memory(GiB)": 21.51, "elapsed_time": "5h 14m 18s", "remaining_time": "6h 22m 54s", "loss_scale": 1.0, "consumed_samples": 734208, "global_step/max_steps": "2868/6362"} +{"lm loss": 4.97984791, "grad_norm": 0.63224536, "learning_rate": 6.326e-05, "elapsed_time_per_iteration": 6.56926155, "memory(GiB)": 21.51, "elapsed_time": "5h 14m 24s", "remaining_time": "6h 22m 48s", "loss_scale": 1.0, "consumed_samples": 734464, "global_step/max_steps": "2869/6362"} +{"lm loss": 4.94570112, "grad_norm": 0.62121511, "learning_rate": 6.323e-05, "elapsed_time_per_iteration": 6.62139297, "memory(GiB)": 21.51, "elapsed_time": "5h 14m 31s", "remaining_time": "6h 22m 41s", "loss_scale": 1.0, "consumed_samples": 734720, "global_step/max_steps": "2870/6362"} +{"lm loss": 4.94039774, "grad_norm": 0.6165269, "learning_rate": 6.321e-05, "elapsed_time_per_iteration": 6.65162659, "memory(GiB)": 21.51, "elapsed_time": "5h 14m 38s", "remaining_time": "6h 22m 35s", "loss_scale": 1.0, "consumed_samples": 734976, "global_step/max_steps": "2871/6362"} +{"lm loss": 4.96229553, "grad_norm": 0.59574515, "learning_rate": 6.318e-05, "elapsed_time_per_iteration": 6.57937884, "memory(GiB)": 21.51, "elapsed_time": "5h 14m 44s", "remaining_time": "6h 22m 28s", "loss_scale": 1.0, "consumed_samples": 735232, "global_step/max_steps": "2872/6362"} +{"lm loss": 4.95998955, "grad_norm": 0.58933794, "learning_rate": 6.316e-05, "elapsed_time_per_iteration": 6.40099573, "memory(GiB)": 21.51, "elapsed_time": "5h 14m 51s", "remaining_time": "6h 22m 21s", "loss_scale": 1.0, "consumed_samples": 735488, "global_step/max_steps": "2873/6362"} +{"lm loss": 4.94908381, "grad_norm": 0.645482, "learning_rate": 6.313e-05, "elapsed_time_per_iteration": 6.57971954, "memory(GiB)": 21.51, "elapsed_time": "5h 14m 57s", "remaining_time": "6h 22m 15s", "loss_scale": 1.0, "consumed_samples": 735744, "global_step/max_steps": "2874/6362"} +{"lm loss": 4.95313835, "grad_norm": 0.56054908, "learning_rate": 6.311e-05, "elapsed_time_per_iteration": 6.58940101, "memory(GiB)": 21.51, "elapsed_time": "5h 15m 4s", "remaining_time": "6h 22m 8s", "loss_scale": 1.0, "consumed_samples": 736000, "global_step/max_steps": "2875/6362"} +{"lm loss": 4.94404364, "grad_norm": 0.60893625, "learning_rate": 6.309e-05, "elapsed_time_per_iteration": 6.56380486, "memory(GiB)": 21.51, "elapsed_time": "5h 15m 10s", "remaining_time": "6h 22m 1s", "loss_scale": 1.0, "consumed_samples": 736256, "global_step/max_steps": "2876/6362"} +{"lm loss": 4.9415803, "grad_norm": 0.56626803, "learning_rate": 6.306e-05, "elapsed_time_per_iteration": 6.60563374, "memory(GiB)": 21.51, "elapsed_time": "5h 15m 17s", "remaining_time": "6h 21m 55s", "loss_scale": 1.0, "consumed_samples": 736512, "global_step/max_steps": "2877/6362"} +{"lm loss": 4.95406961, "grad_norm": 0.47183588, "learning_rate": 6.304e-05, "elapsed_time_per_iteration": 6.52354431, "memory(GiB)": 21.51, "elapsed_time": "5h 15m 24s", "remaining_time": "6h 21m 48s", "loss_scale": 1.0, "consumed_samples": 736768, "global_step/max_steps": "2878/6362"} +{"lm loss": 4.94388247, "grad_norm": 0.52141744, "learning_rate": 6.301e-05, "elapsed_time_per_iteration": 6.52709389, "memory(GiB)": 21.51, "elapsed_time": "5h 15m 30s", "remaining_time": "6h 21m 42s", "loss_scale": 1.0, "consumed_samples": 737024, "global_step/max_steps": "2879/6362"} +{"lm loss": 4.93955946, "grad_norm": 0.5256198, "learning_rate": 6.299e-05, "elapsed_time_per_iteration": 6.72032285, "memory(GiB)": 21.51, "elapsed_time": "5h 15m 37s", "remaining_time": "6h 21m 35s", "loss_scale": 1.0, "consumed_samples": 737280, "global_step/max_steps": "2880/6362"} +{"lm loss": 4.96134758, "grad_norm": 0.55472869, "learning_rate": 6.296e-05, "elapsed_time_per_iteration": 6.78140569, "memory(GiB)": 21.51, "elapsed_time": "5h 15m 44s", "remaining_time": "6h 21m 29s", "loss_scale": 1.0, "consumed_samples": 737536, "global_step/max_steps": "2881/6362"} +{"lm loss": 4.9480958, "grad_norm": 0.59426218, "learning_rate": 6.294e-05, "elapsed_time_per_iteration": 6.53177094, "memory(GiB)": 21.51, "elapsed_time": "5h 15m 50s", "remaining_time": "6h 21m 22s", "loss_scale": 1.0, "consumed_samples": 737792, "global_step/max_steps": "2882/6362"} +{"lm loss": 4.92337036, "grad_norm": 0.61062086, "learning_rate": 6.291e-05, "elapsed_time_per_iteration": 6.59248233, "memory(GiB)": 21.51, "elapsed_time": "5h 15m 57s", "remaining_time": "6h 21m 16s", "loss_scale": 1.0, "consumed_samples": 738048, "global_step/max_steps": "2883/6362"} +{"lm loss": 4.91799259, "grad_norm": 0.54838437, "learning_rate": 6.289e-05, "elapsed_time_per_iteration": 6.66652131, "memory(GiB)": 21.51, "elapsed_time": "5h 16m 3s", "remaining_time": "6h 21m 9s", "loss_scale": 1.0, "consumed_samples": 738304, "global_step/max_steps": "2884/6362"} +{"lm loss": 4.93063259, "grad_norm": 0.57541484, "learning_rate": 6.287e-05, "elapsed_time_per_iteration": 7.00981855, "memory(GiB)": 21.51, "elapsed_time": "5h 16m 10s", "remaining_time": "6h 21m 3s", "loss_scale": 1.0, "consumed_samples": 738560, "global_step/max_steps": "2885/6362"} +{"lm loss": 4.94879103, "grad_norm": 0.53537256, "learning_rate": 6.284e-05, "elapsed_time_per_iteration": 6.5685401, "memory(GiB)": 21.51, "elapsed_time": "5h 16m 17s", "remaining_time": "6h 20m 57s", "loss_scale": 1.0, "consumed_samples": 738816, "global_step/max_steps": "2886/6362"} +{"lm loss": 4.97308397, "grad_norm": 0.58962953, "learning_rate": 6.282e-05, "elapsed_time_per_iteration": 6.67000389, "memory(GiB)": 21.51, "elapsed_time": "5h 16m 24s", "remaining_time": "6h 20m 50s", "loss_scale": 1.0, "consumed_samples": 739072, "global_step/max_steps": "2887/6362"} +{"lm loss": 4.95152903, "grad_norm": 0.5444805, "learning_rate": 6.279e-05, "elapsed_time_per_iteration": 6.54724956, "memory(GiB)": 21.51, "elapsed_time": "5h 16m 30s", "remaining_time": "6h 20m 44s", "loss_scale": 1.0, "consumed_samples": 739328, "global_step/max_steps": "2888/6362"} +{"lm loss": 4.96791363, "grad_norm": 0.49512574, "learning_rate": 6.277e-05, "elapsed_time_per_iteration": 6.34984994, "memory(GiB)": 21.51, "elapsed_time": "5h 16m 37s", "remaining_time": "6h 20m 37s", "loss_scale": 1.0, "consumed_samples": 739584, "global_step/max_steps": "2889/6362"} +{"lm loss": 4.94341755, "grad_norm": 0.58177608, "learning_rate": 6.274e-05, "elapsed_time_per_iteration": 6.67171955, "memory(GiB)": 21.51, "elapsed_time": "5h 16m 43s", "remaining_time": "6h 20m 30s", "loss_scale": 1.0, "consumed_samples": 739840, "global_step/max_steps": "2890/6362"} +{"lm loss": 4.9366641, "grad_norm": 0.60492772, "learning_rate": 6.272e-05, "elapsed_time_per_iteration": 6.53087449, "memory(GiB)": 21.51, "elapsed_time": "5h 16m 50s", "remaining_time": "6h 20m 24s", "loss_scale": 1.0, "consumed_samples": 740096, "global_step/max_steps": "2891/6362"} +{"lm loss": 4.94758701, "grad_norm": 0.54782856, "learning_rate": 6.269e-05, "elapsed_time_per_iteration": 6.62234616, "memory(GiB)": 21.51, "elapsed_time": "5h 16m 56s", "remaining_time": "6h 20m 17s", "loss_scale": 1.0, "consumed_samples": 740352, "global_step/max_steps": "2892/6362"} +{"lm loss": 4.94849396, "grad_norm": 0.59735596, "learning_rate": 6.267e-05, "elapsed_time_per_iteration": 6.45552158, "memory(GiB)": 21.51, "elapsed_time": "5h 17m 3s", "remaining_time": "6h 20m 10s", "loss_scale": 1.0, "consumed_samples": 740608, "global_step/max_steps": "2893/6362"} +{"lm loss": 4.95737934, "grad_norm": 0.54508424, "learning_rate": 6.264e-05, "elapsed_time_per_iteration": 6.50192165, "memory(GiB)": 21.51, "elapsed_time": "5h 17m 9s", "remaining_time": "6h 20m 4s", "loss_scale": 1.0, "consumed_samples": 740864, "global_step/max_steps": "2894/6362"} +{"lm loss": 4.94543076, "grad_norm": 0.6128208, "learning_rate": 6.262e-05, "elapsed_time_per_iteration": 6.79795527, "memory(GiB)": 21.51, "elapsed_time": "5h 17m 16s", "remaining_time": "6h 19m 57s", "loss_scale": 1.0, "consumed_samples": 741120, "global_step/max_steps": "2895/6362"} +{"lm loss": 4.95252323, "grad_norm": 0.50904804, "learning_rate": 6.26e-05, "elapsed_time_per_iteration": 6.57803559, "memory(GiB)": 21.51, "elapsed_time": "5h 17m 23s", "remaining_time": "6h 19m 51s", "loss_scale": 1.0, "consumed_samples": 741376, "global_step/max_steps": "2896/6362"} +{"lm loss": 4.93962669, "grad_norm": 0.50111592, "learning_rate": 6.257e-05, "elapsed_time_per_iteration": 6.70527887, "memory(GiB)": 21.51, "elapsed_time": "5h 17m 29s", "remaining_time": "6h 19m 44s", "loss_scale": 1.0, "consumed_samples": 741632, "global_step/max_steps": "2897/6362"} +{"lm loss": 4.94711351, "grad_norm": 0.49271604, "learning_rate": 6.255e-05, "elapsed_time_per_iteration": 6.57790875, "memory(GiB)": 21.51, "elapsed_time": "5h 17m 36s", "remaining_time": "6h 19m 38s", "loss_scale": 1.0, "consumed_samples": 741888, "global_step/max_steps": "2898/6362"} +{"lm loss": 4.93806362, "grad_norm": 0.50170314, "learning_rate": 6.252e-05, "elapsed_time_per_iteration": 6.36996675, "memory(GiB)": 21.51, "elapsed_time": "5h 17m 42s", "remaining_time": "6h 19m 31s", "loss_scale": 1.0, "consumed_samples": 742144, "global_step/max_steps": "2899/6362"} +{"lm loss": 4.93341208, "grad_norm": 0.51746178, "learning_rate": 6.25e-05, "elapsed_time_per_iteration": 6.72112513, "memory(GiB)": 21.51, "elapsed_time": "5h 17m 49s", "remaining_time": "6h 19m 25s", "loss_scale": 1.0, "consumed_samples": 742400, "global_step/max_steps": "2900/6362"} +{"lm loss": 4.92782593, "grad_norm": 0.53353345, "learning_rate": 6.247e-05, "elapsed_time_per_iteration": 6.7476964, "memory(GiB)": 21.51, "elapsed_time": "5h 17m 56s", "remaining_time": "6h 19m 18s", "loss_scale": 1.0, "consumed_samples": 742656, "global_step/max_steps": "2901/6362"} +{"lm loss": 4.92728233, "grad_norm": 0.60796273, "learning_rate": 6.245e-05, "elapsed_time_per_iteration": 6.82379031, "memory(GiB)": 21.51, "elapsed_time": "5h 18m 3s", "remaining_time": "6h 19m 12s", "loss_scale": 1.0, "consumed_samples": 742912, "global_step/max_steps": "2902/6362"} +{"lm loss": 4.94593191, "grad_norm": 0.63053572, "learning_rate": 6.242e-05, "elapsed_time_per_iteration": 6.32065773, "memory(GiB)": 21.51, "elapsed_time": "5h 18m 9s", "remaining_time": "6h 19m 5s", "loss_scale": 1.0, "consumed_samples": 743168, "global_step/max_steps": "2903/6362"} +{"lm loss": 4.99093914, "grad_norm": 0.60469276, "learning_rate": 6.24e-05, "elapsed_time_per_iteration": 6.68878651, "memory(GiB)": 21.51, "elapsed_time": "5h 18m 16s", "remaining_time": "6h 18m 59s", "loss_scale": 1.0, "consumed_samples": 743424, "global_step/max_steps": "2904/6362"} +{"lm loss": 4.92787647, "grad_norm": 0.48145944, "learning_rate": 6.237e-05, "elapsed_time_per_iteration": 6.58008289, "memory(GiB)": 21.51, "elapsed_time": "5h 18m 22s", "remaining_time": "6h 18m 52s", "loss_scale": 1.0, "consumed_samples": 743680, "global_step/max_steps": "2905/6362"} +{"lm loss": 4.96412611, "grad_norm": 0.54848021, "learning_rate": 6.235e-05, "elapsed_time_per_iteration": 6.54092455, "memory(GiB)": 21.51, "elapsed_time": "5h 18m 29s", "remaining_time": "6h 18m 45s", "loss_scale": 1.0, "consumed_samples": 743936, "global_step/max_steps": "2906/6362"} +{"lm loss": 4.95105028, "grad_norm": 0.59743136, "learning_rate": 6.233e-05, "elapsed_time_per_iteration": 6.65078807, "memory(GiB)": 21.51, "elapsed_time": "5h 18m 35s", "remaining_time": "6h 18m 39s", "loss_scale": 1.0, "consumed_samples": 744192, "global_step/max_steps": "2907/6362"} +{"lm loss": 4.93928194, "grad_norm": 0.52995926, "learning_rate": 6.23e-05, "elapsed_time_per_iteration": 6.58996773, "memory(GiB)": 21.51, "elapsed_time": "5h 18m 42s", "remaining_time": "6h 18m 32s", "loss_scale": 1.0, "consumed_samples": 744448, "global_step/max_steps": "2908/6362"} +{"lm loss": 4.95495844, "grad_norm": 0.57874429, "learning_rate": 6.228e-05, "elapsed_time_per_iteration": 6.56349754, "memory(GiB)": 21.51, "elapsed_time": "5h 18m 49s", "remaining_time": "6h 18m 26s", "loss_scale": 1.0, "consumed_samples": 744704, "global_step/max_steps": "2909/6362"} +{"lm loss": 4.9609561, "grad_norm": 0.57700884, "learning_rate": 6.225e-05, "elapsed_time_per_iteration": 6.93401408, "memory(GiB)": 21.51, "elapsed_time": "5h 18m 56s", "remaining_time": "6h 18m 20s", "loss_scale": 1.0, "consumed_samples": 744960, "global_step/max_steps": "2910/6362"} +{"lm loss": 4.94799662, "grad_norm": 0.59018528, "learning_rate": 6.223e-05, "elapsed_time_per_iteration": 6.44311786, "memory(GiB)": 21.51, "elapsed_time": "5h 19m 2s", "remaining_time": "6h 18m 13s", "loss_scale": 1.0, "consumed_samples": 745216, "global_step/max_steps": "2911/6362"} +{"lm loss": 4.92763233, "grad_norm": 0.57891369, "learning_rate": 6.22e-05, "elapsed_time_per_iteration": 6.47451377, "memory(GiB)": 21.51, "elapsed_time": "5h 19m 8s", "remaining_time": "6h 18m 6s", "loss_scale": 1.0, "consumed_samples": 745472, "global_step/max_steps": "2912/6362"} +{"lm loss": 4.95007372, "grad_norm": 0.47072724, "learning_rate": 6.218e-05, "elapsed_time_per_iteration": 6.85321617, "memory(GiB)": 21.51, "elapsed_time": "5h 19m 15s", "remaining_time": "6h 18m 0s", "loss_scale": 1.0, "consumed_samples": 745728, "global_step/max_steps": "2913/6362"} +{"lm loss": 4.94300079, "grad_norm": 0.47493252, "learning_rate": 6.215e-05, "elapsed_time_per_iteration": 6.71299767, "memory(GiB)": 21.51, "elapsed_time": "5h 19m 22s", "remaining_time": "6h 17m 54s", "loss_scale": 1.0, "consumed_samples": 745984, "global_step/max_steps": "2914/6362"} +{"lm loss": 4.93095732, "grad_norm": 0.45539013, "learning_rate": 6.213e-05, "elapsed_time_per_iteration": 6.35734653, "memory(GiB)": 21.51, "elapsed_time": "5h 19m 28s", "remaining_time": "6h 17m 47s", "loss_scale": 1.0, "consumed_samples": 746240, "global_step/max_steps": "2915/6362"} +{"lm loss": 4.96626759, "grad_norm": 0.47468606, "learning_rate": 6.21e-05, "elapsed_time_per_iteration": 6.52818537, "memory(GiB)": 21.51, "elapsed_time": "5h 19m 35s", "remaining_time": "6h 17m 40s", "loss_scale": 1.0, "consumed_samples": 746496, "global_step/max_steps": "2916/6362"} +{"lm loss": 4.95790291, "grad_norm": 0.47685343, "learning_rate": 6.208e-05, "elapsed_time_per_iteration": 6.46794176, "memory(GiB)": 21.51, "elapsed_time": "5h 19m 41s", "remaining_time": "6h 17m 33s", "loss_scale": 1.0, "consumed_samples": 746752, "global_step/max_steps": "2917/6362"} +{"lm loss": 4.92253876, "grad_norm": 0.52280974, "learning_rate": 6.205e-05, "elapsed_time_per_iteration": 6.88908315, "memory(GiB)": 21.51, "elapsed_time": "5h 19m 48s", "remaining_time": "6h 17m 27s", "loss_scale": 1.0, "consumed_samples": 747008, "global_step/max_steps": "2918/6362"} +{"lm loss": 4.95501328, "grad_norm": 0.57780981, "learning_rate": 6.203e-05, "elapsed_time_per_iteration": 6.45727181, "memory(GiB)": 21.51, "elapsed_time": "5h 19m 55s", "remaining_time": "6h 17m 21s", "loss_scale": 1.0, "consumed_samples": 747264, "global_step/max_steps": "2919/6362"} +{"lm loss": 4.95354843, "grad_norm": 0.58923525, "learning_rate": 6.201e-05, "elapsed_time_per_iteration": 6.7498827, "memory(GiB)": 21.51, "elapsed_time": "5h 20m 1s", "remaining_time": "6h 17m 14s", "loss_scale": 1.0, "consumed_samples": 747520, "global_step/max_steps": "2920/6362"} +{"lm loss": 4.94313669, "grad_norm": 0.57546532, "learning_rate": 6.198e-05, "elapsed_time_per_iteration": 6.63702941, "memory(GiB)": 21.51, "elapsed_time": "5h 20m 8s", "remaining_time": "6h 17m 8s", "loss_scale": 1.0, "consumed_samples": 747776, "global_step/max_steps": "2921/6362"} +{"lm loss": 4.9423027, "grad_norm": 0.61844945, "learning_rate": 6.196e-05, "elapsed_time_per_iteration": 6.63341093, "memory(GiB)": 21.51, "elapsed_time": "5h 20m 15s", "remaining_time": "6h 17m 1s", "loss_scale": 1.0, "consumed_samples": 748032, "global_step/max_steps": "2922/6362"} +{"lm loss": 4.94428873, "grad_norm": 0.62448907, "learning_rate": 6.193e-05, "elapsed_time_per_iteration": 6.66180158, "memory(GiB)": 21.51, "elapsed_time": "5h 20m 21s", "remaining_time": "6h 16m 55s", "loss_scale": 1.0, "consumed_samples": 748288, "global_step/max_steps": "2923/6362"} +{"lm loss": 4.94755983, "grad_norm": 0.52371424, "learning_rate": 6.191e-05, "elapsed_time_per_iteration": 6.5495584, "memory(GiB)": 21.51, "elapsed_time": "5h 20m 28s", "remaining_time": "6h 16m 48s", "loss_scale": 1.0, "consumed_samples": 748544, "global_step/max_steps": "2924/6362"} +{"lm loss": 4.96893692, "grad_norm": 0.47574741, "learning_rate": 6.188e-05, "elapsed_time_per_iteration": 6.63441944, "memory(GiB)": 21.51, "elapsed_time": "5h 20m 35s", "remaining_time": "6h 16m 42s", "loss_scale": 1.0, "consumed_samples": 748800, "global_step/max_steps": "2925/6362"} +{"lm loss": 4.96716118, "grad_norm": 0.4712055, "learning_rate": 6.186e-05, "elapsed_time_per_iteration": 6.51761198, "memory(GiB)": 21.51, "elapsed_time": "5h 20m 41s", "remaining_time": "6h 16m 35s", "loss_scale": 1.0, "consumed_samples": 749056, "global_step/max_steps": "2926/6362"} +{"lm loss": 4.95693493, "grad_norm": 0.53837746, "learning_rate": 6.183e-05, "elapsed_time_per_iteration": 6.63084149, "memory(GiB)": 21.51, "elapsed_time": "5h 20m 48s", "remaining_time": "6h 16m 28s", "loss_scale": 1.0, "consumed_samples": 749312, "global_step/max_steps": "2927/6362"} +{"lm loss": 4.93241358, "grad_norm": 0.59522855, "learning_rate": 6.181e-05, "elapsed_time_per_iteration": 6.57772136, "memory(GiB)": 21.51, "elapsed_time": "5h 20m 54s", "remaining_time": "6h 16m 22s", "loss_scale": 1.0, "consumed_samples": 749568, "global_step/max_steps": "2928/6362"} +{"lm loss": 4.94016981, "grad_norm": 0.60255462, "learning_rate": 6.178e-05, "elapsed_time_per_iteration": 6.48129106, "memory(GiB)": 21.51, "elapsed_time": "5h 21m 1s", "remaining_time": "6h 16m 15s", "loss_scale": 1.0, "consumed_samples": 749824, "global_step/max_steps": "2929/6362"} +{"lm loss": 4.9557209, "grad_norm": 0.57358629, "learning_rate": 6.176e-05, "elapsed_time_per_iteration": 6.68628716, "memory(GiB)": 21.51, "elapsed_time": "5h 21m 7s", "remaining_time": "6h 16m 9s", "loss_scale": 1.0, "consumed_samples": 750080, "global_step/max_steps": "2930/6362"} +{"lm loss": 4.96276379, "grad_norm": 0.51502639, "learning_rate": 6.173e-05, "elapsed_time_per_iteration": 6.61175179, "memory(GiB)": 21.51, "elapsed_time": "5h 21m 14s", "remaining_time": "6h 16m 2s", "loss_scale": 1.0, "consumed_samples": 750336, "global_step/max_steps": "2931/6362"} +{"lm loss": 4.95032406, "grad_norm": 0.53279489, "learning_rate": 6.171e-05, "elapsed_time_per_iteration": 6.77490878, "memory(GiB)": 21.51, "elapsed_time": "5h 21m 21s", "remaining_time": "6h 15m 56s", "loss_scale": 1.0, "consumed_samples": 750592, "global_step/max_steps": "2932/6362"} +{"lm loss": 4.93596315, "grad_norm": 0.61050022, "learning_rate": 6.169e-05, "elapsed_time_per_iteration": 6.59316683, "memory(GiB)": 21.51, "elapsed_time": "5h 21m 27s", "remaining_time": "6h 15m 49s", "loss_scale": 1.0, "consumed_samples": 750848, "global_step/max_steps": "2933/6362"} +{"lm loss": 4.95027304, "grad_norm": 0.66798544, "learning_rate": 6.166e-05, "elapsed_time_per_iteration": 6.45699334, "memory(GiB)": 21.51, "elapsed_time": "5h 21m 34s", "remaining_time": "6h 15m 43s", "loss_scale": 1.0, "consumed_samples": 751104, "global_step/max_steps": "2934/6362"} +{"lm loss": 4.95718288, "grad_norm": 0.52529377, "learning_rate": 6.164e-05, "elapsed_time_per_iteration": 6.60695028, "memory(GiB)": 21.51, "elapsed_time": "5h 21m 41s", "remaining_time": "6h 15m 36s", "loss_scale": 1.0, "consumed_samples": 751360, "global_step/max_steps": "2935/6362"} +{"lm loss": 4.93185663, "grad_norm": 0.50001782, "learning_rate": 6.161e-05, "elapsed_time_per_iteration": 6.47386742, "memory(GiB)": 21.51, "elapsed_time": "5h 21m 47s", "remaining_time": "6h 15m 29s", "loss_scale": 1.0, "consumed_samples": 751616, "global_step/max_steps": "2936/6362"} +{"lm loss": 4.9524703, "grad_norm": 0.55457121, "learning_rate": 6.159e-05, "elapsed_time_per_iteration": 6.82210588, "memory(GiB)": 21.51, "elapsed_time": "5h 21m 54s", "remaining_time": "6h 15m 23s", "loss_scale": 1.0, "consumed_samples": 751872, "global_step/max_steps": "2937/6362"} +{"lm loss": 4.90898609, "grad_norm": 0.48126894, "learning_rate": 6.156e-05, "elapsed_time_per_iteration": 6.72205544, "memory(GiB)": 21.51, "elapsed_time": "5h 22m 1s", "remaining_time": "6h 15m 17s", "loss_scale": 1.0, "consumed_samples": 752128, "global_step/max_steps": "2938/6362"} +{"lm loss": 4.92733097, "grad_norm": 0.50132549, "learning_rate": 6.154e-05, "elapsed_time_per_iteration": 6.60863996, "memory(GiB)": 21.51, "elapsed_time": "5h 22m 7s", "remaining_time": "6h 15m 10s", "loss_scale": 1.0, "consumed_samples": 752384, "global_step/max_steps": "2939/6362"} +{"lm loss": 4.95090818, "grad_norm": 0.48855457, "learning_rate": 6.151e-05, "elapsed_time_per_iteration": 6.59728813, "memory(GiB)": 21.51, "elapsed_time": "5h 22m 14s", "remaining_time": "6h 15m 4s", "loss_scale": 1.0, "consumed_samples": 752640, "global_step/max_steps": "2940/6362"} +{"lm loss": 4.93686581, "grad_norm": 0.48362646, "learning_rate": 6.149e-05, "elapsed_time_per_iteration": 6.69217563, "memory(GiB)": 21.51, "elapsed_time": "5h 22m 20s", "remaining_time": "6h 14m 57s", "loss_scale": 1.0, "consumed_samples": 752896, "global_step/max_steps": "2941/6362"} +{"lm loss": 4.92636108, "grad_norm": 0.50088048, "learning_rate": 6.146e-05, "elapsed_time_per_iteration": 6.67989349, "memory(GiB)": 21.51, "elapsed_time": "5h 22m 27s", "remaining_time": "6h 14m 51s", "loss_scale": 1.0, "consumed_samples": 753152, "global_step/max_steps": "2942/6362"} +{"lm loss": 4.95432806, "grad_norm": 0.49891892, "learning_rate": 6.144e-05, "elapsed_time_per_iteration": 6.56169629, "memory(GiB)": 21.51, "elapsed_time": "5h 22m 34s", "remaining_time": "6h 14m 44s", "loss_scale": 1.0, "consumed_samples": 753408, "global_step/max_steps": "2943/6362"} +{"lm loss": 4.9460988, "grad_norm": 0.47013584, "learning_rate": 6.141e-05, "elapsed_time_per_iteration": 6.68715596, "memory(GiB)": 21.51, "elapsed_time": "5h 22m 40s", "remaining_time": "6h 14m 38s", "loss_scale": 1.0, "consumed_samples": 753664, "global_step/max_steps": "2944/6362"} +{"lm loss": 4.93622923, "grad_norm": 0.63926727, "learning_rate": 6.139e-05, "elapsed_time_per_iteration": 6.5088017, "memory(GiB)": 21.51, "elapsed_time": "5h 22m 47s", "remaining_time": "6h 14m 31s", "loss_scale": 1.0, "consumed_samples": 753920, "global_step/max_steps": "2945/6362"} +{"lm loss": 4.94362402, "grad_norm": 0.65951443, "learning_rate": 6.136e-05, "elapsed_time_per_iteration": 6.64168763, "memory(GiB)": 21.51, "elapsed_time": "5h 22m 54s", "remaining_time": "6h 14m 24s", "loss_scale": 1.0, "consumed_samples": 754176, "global_step/max_steps": "2946/6362"} +{"lm loss": 4.93704414, "grad_norm": 0.67925888, "learning_rate": 6.134e-05, "elapsed_time_per_iteration": 6.76942968, "memory(GiB)": 21.51, "elapsed_time": "5h 23m 0s", "remaining_time": "6h 14m 18s", "loss_scale": 1.0, "consumed_samples": 754432, "global_step/max_steps": "2947/6362"} +{"lm loss": 4.92330265, "grad_norm": 0.53371745, "learning_rate": 6.132e-05, "elapsed_time_per_iteration": 6.5618453, "memory(GiB)": 21.51, "elapsed_time": "5h 23m 7s", "remaining_time": "6h 14m 11s", "loss_scale": 1.0, "consumed_samples": 754688, "global_step/max_steps": "2948/6362"} +{"lm loss": 4.9539113, "grad_norm": 0.51099694, "learning_rate": 6.129e-05, "elapsed_time_per_iteration": 6.71368384, "memory(GiB)": 21.51, "elapsed_time": "5h 23m 14s", "remaining_time": "6h 14m 5s", "loss_scale": 1.0, "consumed_samples": 754944, "global_step/max_steps": "2949/6362"} +{"lm loss": 4.96202517, "grad_norm": 0.61981636, "learning_rate": 6.127e-05, "elapsed_time_per_iteration": 6.57497478, "memory(GiB)": 21.51, "elapsed_time": "5h 23m 20s", "remaining_time": "6h 13m 58s", "loss_scale": 1.0, "consumed_samples": 755200, "global_step/max_steps": "2950/6362"} +{"lm loss": 4.95035124, "grad_norm": 0.5648191, "learning_rate": 6.124e-05, "elapsed_time_per_iteration": 6.5669198, "memory(GiB)": 21.51, "elapsed_time": "5h 23m 27s", "remaining_time": "6h 13m 52s", "loss_scale": 1.0, "consumed_samples": 755456, "global_step/max_steps": "2951/6362"} +{"lm loss": 4.92623711, "grad_norm": 0.52473247, "learning_rate": 6.122e-05, "elapsed_time_per_iteration": 6.48740435, "memory(GiB)": 21.51, "elapsed_time": "5h 23m 33s", "remaining_time": "6h 13m 45s", "loss_scale": 1.0, "consumed_samples": 755712, "global_step/max_steps": "2952/6362"} +{"lm loss": 4.93893051, "grad_norm": 0.5182882, "learning_rate": 6.119e-05, "elapsed_time_per_iteration": 6.53532267, "memory(GiB)": 21.51, "elapsed_time": "5h 23m 40s", "remaining_time": "6h 13m 39s", "loss_scale": 1.0, "consumed_samples": 755968, "global_step/max_steps": "2953/6362"} +{"lm loss": 4.95694065, "grad_norm": 0.52682281, "learning_rate": 6.117e-05, "elapsed_time_per_iteration": 6.42286587, "memory(GiB)": 21.51, "elapsed_time": "5h 23m 46s", "remaining_time": "6h 13m 32s", "loss_scale": 1.0, "consumed_samples": 756224, "global_step/max_steps": "2954/6362"} +{"lm loss": 4.95556593, "grad_norm": 0.50503534, "learning_rate": 6.114e-05, "elapsed_time_per_iteration": 6.28980064, "memory(GiB)": 21.51, "elapsed_time": "5h 23m 52s", "remaining_time": "6h 13m 25s", "loss_scale": 1.0, "consumed_samples": 756480, "global_step/max_steps": "2955/6362"} +{"lm loss": 4.95873785, "grad_norm": 0.55632979, "learning_rate": 6.112e-05, "elapsed_time_per_iteration": 6.63535285, "memory(GiB)": 21.51, "elapsed_time": "5h 23m 59s", "remaining_time": "6h 13m 18s", "loss_scale": 1.0, "consumed_samples": 756736, "global_step/max_steps": "2956/6362"} +{"lm loss": 4.93664885, "grad_norm": 0.54175782, "learning_rate": 6.109e-05, "elapsed_time_per_iteration": 6.57669759, "memory(GiB)": 21.51, "elapsed_time": "5h 24m 6s", "remaining_time": "6h 13m 12s", "loss_scale": 1.0, "consumed_samples": 756992, "global_step/max_steps": "2957/6362"} +{"lm loss": 4.9423008, "grad_norm": 0.61291438, "learning_rate": 6.107e-05, "elapsed_time_per_iteration": 6.34866691, "memory(GiB)": 21.51, "elapsed_time": "5h 24m 12s", "remaining_time": "6h 13m 5s", "loss_scale": 1.0, "consumed_samples": 757248, "global_step/max_steps": "2958/6362"} +{"lm loss": 4.94628382, "grad_norm": 0.66319263, "learning_rate": 6.104e-05, "elapsed_time_per_iteration": 6.56314754, "memory(GiB)": 21.51, "elapsed_time": "5h 24m 19s", "remaining_time": "6h 12m 58s", "loss_scale": 1.0, "consumed_samples": 757504, "global_step/max_steps": "2959/6362"} +{"lm loss": 4.94938707, "grad_norm": 0.61964762, "learning_rate": 6.102e-05, "elapsed_time_per_iteration": 6.78748608, "memory(GiB)": 21.51, "elapsed_time": "5h 24m 25s", "remaining_time": "6h 12m 52s", "loss_scale": 1.0, "consumed_samples": 757760, "global_step/max_steps": "2960/6362"} +{"lm loss": 4.94692326, "grad_norm": 0.54705608, "learning_rate": 6.099e-05, "elapsed_time_per_iteration": 6.59398818, "memory(GiB)": 21.51, "elapsed_time": "5h 24m 32s", "remaining_time": "6h 12m 46s", "loss_scale": 1.0, "consumed_samples": 758016, "global_step/max_steps": "2961/6362"} +{"lm loss": 4.9566679, "grad_norm": 0.50265127, "learning_rate": 6.097e-05, "elapsed_time_per_iteration": 6.67082787, "memory(GiB)": 21.51, "elapsed_time": "5h 24m 39s", "remaining_time": "6h 12m 39s", "loss_scale": 1.0, "consumed_samples": 758272, "global_step/max_steps": "2962/6362"} +{"lm loss": 4.95575237, "grad_norm": 0.62858033, "learning_rate": 6.094e-05, "elapsed_time_per_iteration": 6.88501644, "memory(GiB)": 21.51, "elapsed_time": "5h 24m 45s", "remaining_time": "6h 12m 33s", "loss_scale": 1.0, "consumed_samples": 758528, "global_step/max_steps": "2963/6362"} +{"lm loss": 4.94558954, "grad_norm": 0.62947732, "learning_rate": 6.092e-05, "elapsed_time_per_iteration": 6.73187852, "memory(GiB)": 21.51, "elapsed_time": "5h 24m 52s", "remaining_time": "6h 12m 26s", "loss_scale": 1.0, "consumed_samples": 758784, "global_step/max_steps": "2964/6362"} +{"lm loss": 4.95082521, "grad_norm": 0.56804508, "learning_rate": 6.09e-05, "elapsed_time_per_iteration": 6.59096384, "memory(GiB)": 21.51, "elapsed_time": "5h 24m 59s", "remaining_time": "6h 12m 20s", "loss_scale": 1.0, "consumed_samples": 759040, "global_step/max_steps": "2965/6362"} +{"lm loss": 4.95772839, "grad_norm": 0.49674389, "learning_rate": 6.087e-05, "elapsed_time_per_iteration": 6.511168, "memory(GiB)": 21.51, "elapsed_time": "5h 25m 5s", "remaining_time": "6h 12m 13s", "loss_scale": 1.0, "consumed_samples": 759296, "global_step/max_steps": "2966/6362"} +{"lm loss": 4.92079735, "grad_norm": 0.54017729, "learning_rate": 6.085e-05, "elapsed_time_per_iteration": 6.81560564, "memory(GiB)": 21.51, "elapsed_time": "5h 25m 12s", "remaining_time": "6h 12m 7s", "loss_scale": 1.0, "consumed_samples": 759552, "global_step/max_steps": "2967/6362"} +{"lm loss": 4.95328045, "grad_norm": 0.5236097, "learning_rate": 6.082e-05, "elapsed_time_per_iteration": 6.29357672, "memory(GiB)": 21.51, "elapsed_time": "5h 25m 18s", "remaining_time": "6h 12m 0s", "loss_scale": 1.0, "consumed_samples": 759808, "global_step/max_steps": "2968/6362"} +{"lm loss": 4.95988894, "grad_norm": 0.50185734, "learning_rate": 6.08e-05, "elapsed_time_per_iteration": 6.44111657, "memory(GiB)": 21.51, "elapsed_time": "5h 25m 25s", "remaining_time": "6h 11m 53s", "loss_scale": 1.0, "consumed_samples": 760064, "global_step/max_steps": "2969/6362"} +{"lm loss": 4.94594431, "grad_norm": 0.46493447, "learning_rate": 6.077e-05, "elapsed_time_per_iteration": 6.64213777, "memory(GiB)": 21.51, "elapsed_time": "5h 25m 32s", "remaining_time": "6h 11m 47s", "loss_scale": 1.0, "consumed_samples": 760320, "global_step/max_steps": "2970/6362"} +{"lm loss": 4.94320059, "grad_norm": 0.48624897, "learning_rate": 6.075e-05, "elapsed_time_per_iteration": 6.53529096, "memory(GiB)": 21.51, "elapsed_time": "5h 25m 38s", "remaining_time": "6h 11m 40s", "loss_scale": 1.0, "consumed_samples": 760576, "global_step/max_steps": "2971/6362"} +{"lm loss": 4.9821496, "grad_norm": 0.50851816, "learning_rate": 6.072e-05, "elapsed_time_per_iteration": 6.39715028, "memory(GiB)": 21.51, "elapsed_time": "5h 25m 44s", "remaining_time": "6h 11m 33s", "loss_scale": 1.0, "consumed_samples": 760832, "global_step/max_steps": "2972/6362"} +{"lm loss": 4.95303345, "grad_norm": 0.59283191, "learning_rate": 6.07e-05, "elapsed_time_per_iteration": 6.38822627, "memory(GiB)": 21.51, "elapsed_time": "5h 25m 51s", "remaining_time": "6h 11m 27s", "loss_scale": 1.0, "consumed_samples": 761088, "global_step/max_steps": "2973/6362"} +{"lm loss": 4.94288397, "grad_norm": 0.59143448, "learning_rate": 6.067e-05, "elapsed_time_per_iteration": 6.51248789, "memory(GiB)": 21.51, "elapsed_time": "5h 25m 57s", "remaining_time": "6h 11m 20s", "loss_scale": 1.0, "consumed_samples": 761344, "global_step/max_steps": "2974/6362"} +{"lm loss": 4.95391512, "grad_norm": 0.5284524, "learning_rate": 6.065e-05, "elapsed_time_per_iteration": 6.6033833, "memory(GiB)": 21.51, "elapsed_time": "5h 26m 4s", "remaining_time": "6h 11m 13s", "loss_scale": 1.0, "consumed_samples": 761600, "global_step/max_steps": "2975/6362"} +{"lm loss": 4.93183804, "grad_norm": 0.5879879, "learning_rate": 6.062e-05, "elapsed_time_per_iteration": 6.61468434, "memory(GiB)": 21.51, "elapsed_time": "5h 26m 11s", "remaining_time": "6h 11m 7s", "loss_scale": 1.0, "consumed_samples": 761856, "global_step/max_steps": "2976/6362"} +{"lm loss": 4.93264151, "grad_norm": 0.56083727, "learning_rate": 6.06e-05, "elapsed_time_per_iteration": 6.3551755, "memory(GiB)": 21.51, "elapsed_time": "5h 26m 17s", "remaining_time": "6h 11m 0s", "loss_scale": 1.0, "consumed_samples": 762112, "global_step/max_steps": "2977/6362"} +{"lm loss": 4.94410133, "grad_norm": 0.5661121, "learning_rate": 6.057e-05, "elapsed_time_per_iteration": 6.48144078, "memory(GiB)": 21.51, "elapsed_time": "5h 26m 23s", "remaining_time": "6h 10m 53s", "loss_scale": 1.0, "consumed_samples": 762368, "global_step/max_steps": "2978/6362"} +{"lm loss": 4.93572712, "grad_norm": 0.5890916, "learning_rate": 6.055e-05, "elapsed_time_per_iteration": 6.60954094, "memory(GiB)": 21.51, "elapsed_time": "5h 26m 30s", "remaining_time": "6h 10m 47s", "loss_scale": 1.0, "consumed_samples": 762624, "global_step/max_steps": "2979/6362"} +{"lm loss": 4.92717457, "grad_norm": 0.75778359, "learning_rate": 6.052e-05, "elapsed_time_per_iteration": 6.4566896, "memory(GiB)": 21.51, "elapsed_time": "5h 26m 36s", "remaining_time": "6h 10m 40s", "loss_scale": 1.0, "consumed_samples": 762880, "global_step/max_steps": "2980/6362"} +{"lm loss": 4.9447608, "grad_norm": 0.7244696, "learning_rate": 6.05e-05, "elapsed_time_per_iteration": 6.31743741, "memory(GiB)": 21.51, "elapsed_time": "5h 26m 43s", "remaining_time": "6h 10m 33s", "loss_scale": 1.0, "consumed_samples": 763136, "global_step/max_steps": "2981/6362"} +{"lm loss": 4.94601631, "grad_norm": 0.6451292, "learning_rate": 6.047e-05, "elapsed_time_per_iteration": 6.40162253, "memory(GiB)": 21.51, "elapsed_time": "5h 26m 49s", "remaining_time": "6h 10m 26s", "loss_scale": 1.0, "consumed_samples": 763392, "global_step/max_steps": "2982/6362"} +{"lm loss": 4.94799995, "grad_norm": 0.59211057, "learning_rate": 6.045e-05, "elapsed_time_per_iteration": 6.52171159, "memory(GiB)": 21.51, "elapsed_time": "5h 26m 56s", "remaining_time": "6h 10m 20s", "loss_scale": 1.0, "consumed_samples": 763648, "global_step/max_steps": "2983/6362"} +{"lm loss": 4.95929861, "grad_norm": 0.63079256, "learning_rate": 6.042e-05, "elapsed_time_per_iteration": 6.43219471, "memory(GiB)": 21.51, "elapsed_time": "5h 27m 2s", "remaining_time": "6h 10m 13s", "loss_scale": 1.0, "consumed_samples": 763904, "global_step/max_steps": "2984/6362"} +{"lm loss": 4.93727541, "grad_norm": 0.59476298, "learning_rate": 6.04e-05, "elapsed_time_per_iteration": 6.71219063, "memory(GiB)": 21.51, "elapsed_time": "5h 27m 9s", "remaining_time": "6h 10m 7s", "loss_scale": 1.0, "consumed_samples": 764160, "global_step/max_steps": "2985/6362"} +{"lm loss": 4.94045734, "grad_norm": 0.63144642, "learning_rate": 6.038e-05, "elapsed_time_per_iteration": 6.48717928, "memory(GiB)": 21.51, "elapsed_time": "5h 27m 15s", "remaining_time": "6h 10m 0s", "loss_scale": 1.0, "consumed_samples": 764416, "global_step/max_steps": "2986/6362"} +{"lm loss": 4.93381882, "grad_norm": 0.57350868, "learning_rate": 6.035e-05, "elapsed_time_per_iteration": 6.67919898, "memory(GiB)": 21.51, "elapsed_time": "5h 27m 22s", "remaining_time": "6h 9m 54s", "loss_scale": 1.0, "consumed_samples": 764672, "global_step/max_steps": "2987/6362"} +{"lm loss": 4.94003105, "grad_norm": 0.63685298, "learning_rate": 6.033e-05, "elapsed_time_per_iteration": 6.5649116, "memory(GiB)": 21.51, "elapsed_time": "5h 27m 29s", "remaining_time": "6h 9m 47s", "loss_scale": 1.0, "consumed_samples": 764928, "global_step/max_steps": "2988/6362"} +{"lm loss": 4.92037344, "grad_norm": 0.58808774, "learning_rate": 6.03e-05, "elapsed_time_per_iteration": 7.2897799, "memory(GiB)": 21.51, "elapsed_time": "5h 27m 36s", "remaining_time": "6h 9m 41s", "loss_scale": 1.0, "consumed_samples": 765184, "global_step/max_steps": "2989/6362"} +{"lm loss": 4.92773867, "grad_norm": 0.56128764, "learning_rate": 6.028e-05, "elapsed_time_per_iteration": 6.54238772, "memory(GiB)": 21.51, "elapsed_time": "5h 27m 42s", "remaining_time": "6h 9m 35s", "loss_scale": 1.0, "consumed_samples": 765440, "global_step/max_steps": "2990/6362"} +{"lm loss": 4.95299244, "grad_norm": 0.54682952, "learning_rate": 6.025e-05, "elapsed_time_per_iteration": 6.53876519, "memory(GiB)": 21.51, "elapsed_time": "5h 27m 49s", "remaining_time": "6h 9m 28s", "loss_scale": 1.0, "consumed_samples": 765696, "global_step/max_steps": "2991/6362"} +{"lm loss": 4.94307899, "grad_norm": 0.50715166, "learning_rate": 6.023e-05, "elapsed_time_per_iteration": 6.52235603, "memory(GiB)": 21.51, "elapsed_time": "5h 27m 55s", "remaining_time": "6h 9m 21s", "loss_scale": 1.0, "consumed_samples": 765952, "global_step/max_steps": "2992/6362"} +{"lm loss": 4.9674859, "grad_norm": 0.52949268, "learning_rate": 6.02e-05, "elapsed_time_per_iteration": 6.53096104, "memory(GiB)": 21.51, "elapsed_time": "5h 28m 2s", "remaining_time": "6h 9m 15s", "loss_scale": 1.0, "consumed_samples": 766208, "global_step/max_steps": "2993/6362"} +{"lm loss": 4.95540714, "grad_norm": 0.43002552, "learning_rate": 6.018e-05, "elapsed_time_per_iteration": 6.58702469, "memory(GiB)": 21.51, "elapsed_time": "5h 28m 9s", "remaining_time": "6h 9m 8s", "loss_scale": 1.0, "consumed_samples": 766464, "global_step/max_steps": "2994/6362"} +{"lm loss": 4.93156052, "grad_norm": 0.55077237, "learning_rate": 6.015e-05, "elapsed_time_per_iteration": 6.59945059, "memory(GiB)": 21.51, "elapsed_time": "5h 28m 15s", "remaining_time": "6h 9m 2s", "loss_scale": 1.0, "consumed_samples": 766720, "global_step/max_steps": "2995/6362"} +{"lm loss": 4.93582201, "grad_norm": 0.58064044, "learning_rate": 6.013e-05, "elapsed_time_per_iteration": 6.25134873, "memory(GiB)": 21.51, "elapsed_time": "5h 28m 21s", "remaining_time": "6h 8m 55s", "loss_scale": 1.0, "consumed_samples": 766976, "global_step/max_steps": "2996/6362"} +{"lm loss": 4.946455, "grad_norm": 0.48555341, "learning_rate": 6.01e-05, "elapsed_time_per_iteration": 6.62724328, "memory(GiB)": 21.51, "elapsed_time": "5h 28m 28s", "remaining_time": "6h 8m 48s", "loss_scale": 1.0, "consumed_samples": 767232, "global_step/max_steps": "2997/6362"} +{"lm loss": 4.93742657, "grad_norm": 0.48813018, "learning_rate": 6.008e-05, "elapsed_time_per_iteration": 6.56593752, "memory(GiB)": 21.51, "elapsed_time": "5h 28m 35s", "remaining_time": "6h 8m 41s", "loss_scale": 1.0, "consumed_samples": 767488, "global_step/max_steps": "2998/6362"} +{"lm loss": 4.93925047, "grad_norm": 0.48701447, "learning_rate": 6.005e-05, "elapsed_time_per_iteration": 6.30841732, "memory(GiB)": 21.51, "elapsed_time": "5h 28m 41s", "remaining_time": "6h 8m 35s", "loss_scale": 1.0, "consumed_samples": 767744, "global_step/max_steps": "2999/6362"} +{"lm loss": 4.92908669, "grad_norm": 0.4806501, "learning_rate": 6.003e-05, "elapsed_time_per_iteration": 6.66669011, "memory(GiB)": 21.51, "elapsed_time": "5h 28m 48s", "remaining_time": "6h 8m 28s", "loss_scale": 1.0, "consumed_samples": 768000, "global_step/max_steps": "3000/6362"} +{"lm loss": 4.92929506, "grad_norm": 0.49493417, "learning_rate": 6e-05, "elapsed_time_per_iteration": 6.7756176, "memory(GiB)": 21.51, "elapsed_time": "5h 28m 54s", "remaining_time": "6h 8m 22s", "loss_scale": 1.0, "consumed_samples": 768256, "global_step/max_steps": "3001/6362"} +{"lm loss": 4.93841314, "grad_norm": 0.46589771, "learning_rate": 5.998e-05, "elapsed_time_per_iteration": 6.45191932, "memory(GiB)": 21.51, "elapsed_time": "5h 29m 1s", "remaining_time": "6h 8m 15s", "loss_scale": 1.0, "consumed_samples": 768512, "global_step/max_steps": "3002/6362"} +{"lm loss": 4.95984697, "grad_norm": 0.47442931, "learning_rate": 5.995e-05, "elapsed_time_per_iteration": 6.59944344, "memory(GiB)": 21.51, "elapsed_time": "5h 29m 7s", "remaining_time": "6h 8m 9s", "loss_scale": 1.0, "consumed_samples": 768768, "global_step/max_steps": "3003/6362"} +{"lm loss": 4.92445755, "grad_norm": 0.5009141, "learning_rate": 5.993e-05, "elapsed_time_per_iteration": 6.74979472, "memory(GiB)": 21.51, "elapsed_time": "5h 29m 14s", "remaining_time": "6h 8m 2s", "loss_scale": 1.0, "consumed_samples": 769024, "global_step/max_steps": "3004/6362"} +{"lm loss": 4.93715763, "grad_norm": 0.46437284, "learning_rate": 5.99e-05, "elapsed_time_per_iteration": 6.644243, "memory(GiB)": 21.51, "elapsed_time": "5h 29m 21s", "remaining_time": "6h 7m 56s", "loss_scale": 1.0, "consumed_samples": 769280, "global_step/max_steps": "3005/6362"} +{"lm loss": 4.95151186, "grad_norm": 0.43741849, "learning_rate": 5.988e-05, "elapsed_time_per_iteration": 6.68332863, "memory(GiB)": 21.51, "elapsed_time": "5h 29m 28s", "remaining_time": "6h 7m 49s", "loss_scale": 1.0, "consumed_samples": 769536, "global_step/max_steps": "3006/6362"} +{"lm loss": 4.95190334, "grad_norm": 0.46230084, "learning_rate": 5.985e-05, "elapsed_time_per_iteration": 6.24303889, "memory(GiB)": 21.51, "elapsed_time": "5h 29m 34s", "remaining_time": "6h 7m 42s", "loss_scale": 1.0, "consumed_samples": 769792, "global_step/max_steps": "3007/6362"} +{"lm loss": 4.9498024, "grad_norm": 0.52407587, "learning_rate": 5.983e-05, "elapsed_time_per_iteration": 6.45301533, "memory(GiB)": 21.51, "elapsed_time": "5h 29m 40s", "remaining_time": "6h 7m 36s", "loss_scale": 1.0, "consumed_samples": 770048, "global_step/max_steps": "3008/6362"} +{"lm loss": 4.96368551, "grad_norm": 0.63511342, "learning_rate": 5.98e-05, "elapsed_time_per_iteration": 6.75035167, "memory(GiB)": 21.51, "elapsed_time": "5h 29m 47s", "remaining_time": "6h 7m 29s", "loss_scale": 1.0, "consumed_samples": 770304, "global_step/max_steps": "3009/6362"} +{"lm loss": 4.95376444, "grad_norm": 0.59214079, "learning_rate": 5.978e-05, "elapsed_time_per_iteration": 6.73735118, "memory(GiB)": 21.51, "elapsed_time": "5h 29m 54s", "remaining_time": "6h 7m 23s", "loss_scale": 1.0, "consumed_samples": 770560, "global_step/max_steps": "3010/6362"} +{"lm loss": 4.9357295, "grad_norm": 0.53919977, "learning_rate": 5.975e-05, "elapsed_time_per_iteration": 6.44389296, "memory(GiB)": 21.51, "elapsed_time": "5h 30m 0s", "remaining_time": "6h 7m 16s", "loss_scale": 1.0, "consumed_samples": 770816, "global_step/max_steps": "3011/6362"} +{"lm loss": 4.92177248, "grad_norm": 0.51637042, "learning_rate": 5.973e-05, "elapsed_time_per_iteration": 6.49004722, "memory(GiB)": 21.51, "elapsed_time": "5h 30m 7s", "remaining_time": "6h 7m 9s", "loss_scale": 1.0, "consumed_samples": 771072, "global_step/max_steps": "3012/6362"} +{"lm loss": 4.95698071, "grad_norm": 0.55901074, "learning_rate": 5.971e-05, "elapsed_time_per_iteration": 6.48962355, "memory(GiB)": 21.51, "elapsed_time": "5h 30m 13s", "remaining_time": "6h 7m 3s", "loss_scale": 1.0, "consumed_samples": 771328, "global_step/max_steps": "3013/6362"} +{"lm loss": 4.94151449, "grad_norm": 0.65295434, "learning_rate": 5.968e-05, "elapsed_time_per_iteration": 6.2444129, "memory(GiB)": 21.51, "elapsed_time": "5h 30m 19s", "remaining_time": "6h 6m 56s", "loss_scale": 1.0, "consumed_samples": 771584, "global_step/max_steps": "3014/6362"} +{"lm loss": 4.9471159, "grad_norm": 0.59706551, "learning_rate": 5.966e-05, "elapsed_time_per_iteration": 6.57634568, "memory(GiB)": 21.51, "elapsed_time": "5h 30m 26s", "remaining_time": "6h 6m 49s", "loss_scale": 1.0, "consumed_samples": 771840, "global_step/max_steps": "3015/6362"} +{"lm loss": 4.96428156, "grad_norm": 0.48915434, "learning_rate": 5.963e-05, "elapsed_time_per_iteration": 6.40333247, "memory(GiB)": 21.51, "elapsed_time": "5h 30m 32s", "remaining_time": "6h 6m 42s", "loss_scale": 1.0, "consumed_samples": 772096, "global_step/max_steps": "3016/6362"} +{"lm loss": 4.93787384, "grad_norm": 0.53939831, "learning_rate": 5.961e-05, "elapsed_time_per_iteration": 6.45400834, "memory(GiB)": 21.51, "elapsed_time": "5h 30m 39s", "remaining_time": "6h 6m 36s", "loss_scale": 1.0, "consumed_samples": 772352, "global_step/max_steps": "3017/6362"} +{"lm loss": 4.96759748, "grad_norm": 0.60569346, "learning_rate": 5.958e-05, "elapsed_time_per_iteration": 6.5139482, "memory(GiB)": 21.51, "elapsed_time": "5h 30m 45s", "remaining_time": "6h 6m 29s", "loss_scale": 1.0, "consumed_samples": 772608, "global_step/max_steps": "3018/6362"} +{"lm loss": 4.90897083, "grad_norm": 0.6517235, "learning_rate": 5.956e-05, "elapsed_time_per_iteration": 6.55006742, "memory(GiB)": 21.51, "elapsed_time": "5h 30m 52s", "remaining_time": "6h 6m 22s", "loss_scale": 1.0, "consumed_samples": 772864, "global_step/max_steps": "3019/6362"} +{"lm loss": 4.94748497, "grad_norm": 0.60266238, "learning_rate": 5.953e-05, "elapsed_time_per_iteration": 6.46218371, "memory(GiB)": 21.51, "elapsed_time": "5h 30m 58s", "remaining_time": "6h 6m 16s", "loss_scale": 1.0, "consumed_samples": 773120, "global_step/max_steps": "3020/6362"} +{"lm loss": 4.94524765, "grad_norm": 0.60647547, "learning_rate": 5.951e-05, "elapsed_time_per_iteration": 6.44192743, "memory(GiB)": 21.51, "elapsed_time": "5h 31m 5s", "remaining_time": "6h 6m 9s", "loss_scale": 1.0, "consumed_samples": 773376, "global_step/max_steps": "3021/6362"} +{"lm loss": 4.95541573, "grad_norm": 0.54407537, "learning_rate": 5.948e-05, "elapsed_time_per_iteration": 6.23147941, "memory(GiB)": 21.51, "elapsed_time": "5h 31m 11s", "remaining_time": "6h 6m 2s", "loss_scale": 1.0, "consumed_samples": 773632, "global_step/max_steps": "3022/6362"} +{"lm loss": 4.94861841, "grad_norm": 0.55866438, "learning_rate": 5.946e-05, "elapsed_time_per_iteration": 6.79324317, "memory(GiB)": 21.51, "elapsed_time": "5h 31m 18s", "remaining_time": "6h 5m 56s", "loss_scale": 1.0, "consumed_samples": 773888, "global_step/max_steps": "3023/6362"} +{"lm loss": 4.96180105, "grad_norm": 0.52556735, "learning_rate": 5.943e-05, "elapsed_time_per_iteration": 6.73435473, "memory(GiB)": 21.51, "elapsed_time": "5h 31m 25s", "remaining_time": "6h 5m 49s", "loss_scale": 1.0, "consumed_samples": 774144, "global_step/max_steps": "3024/6362"} +{"lm loss": 4.95281792, "grad_norm": 0.57301974, "learning_rate": 5.941e-05, "elapsed_time_per_iteration": 6.47129536, "memory(GiB)": 21.51, "elapsed_time": "5h 31m 31s", "remaining_time": "6h 5m 43s", "loss_scale": 1.0, "consumed_samples": 774400, "global_step/max_steps": "3025/6362"} +{"lm loss": 4.94780493, "grad_norm": 0.53074741, "learning_rate": 5.938e-05, "elapsed_time_per_iteration": 6.51575851, "memory(GiB)": 21.51, "elapsed_time": "5h 31m 38s", "remaining_time": "6h 5m 36s", "loss_scale": 1.0, "consumed_samples": 774656, "global_step/max_steps": "3026/6362"} +{"lm loss": 4.96064758, "grad_norm": 0.56908721, "learning_rate": 5.936e-05, "elapsed_time_per_iteration": 6.53397465, "memory(GiB)": 21.51, "elapsed_time": "5h 31m 44s", "remaining_time": "6h 5m 29s", "loss_scale": 1.0, "consumed_samples": 774912, "global_step/max_steps": "3027/6362"} +{"lm loss": 4.94205999, "grad_norm": 0.51171392, "learning_rate": 5.933e-05, "elapsed_time_per_iteration": 6.81587434, "memory(GiB)": 21.51, "elapsed_time": "5h 31m 51s", "remaining_time": "6h 5m 23s", "loss_scale": 1.0, "consumed_samples": 775168, "global_step/max_steps": "3028/6362"} +{"lm loss": 4.94397259, "grad_norm": 0.48739824, "learning_rate": 5.931e-05, "elapsed_time_per_iteration": 6.76062775, "memory(GiB)": 21.51, "elapsed_time": "5h 31m 58s", "remaining_time": "6h 5m 17s", "loss_scale": 1.0, "consumed_samples": 775424, "global_step/max_steps": "3029/6362"} +{"lm loss": 4.93043947, "grad_norm": 0.52558982, "learning_rate": 5.928e-05, "elapsed_time_per_iteration": 6.55157351, "memory(GiB)": 21.51, "elapsed_time": "5h 32m 4s", "remaining_time": "6h 5m 10s", "loss_scale": 1.0, "consumed_samples": 775680, "global_step/max_steps": "3030/6362"} +{"lm loss": 4.93690014, "grad_norm": 0.49129853, "learning_rate": 5.926e-05, "elapsed_time_per_iteration": 6.69808412, "memory(GiB)": 21.51, "elapsed_time": "5h 32m 11s", "remaining_time": "6h 5m 4s", "loss_scale": 1.0, "consumed_samples": 775936, "global_step/max_steps": "3031/6362"} +{"lm loss": 4.95071697, "grad_norm": 0.58114773, "learning_rate": 5.923e-05, "elapsed_time_per_iteration": 6.86810803, "memory(GiB)": 21.51, "elapsed_time": "5h 32m 18s", "remaining_time": "6h 4m 57s", "loss_scale": 1.0, "consumed_samples": 776192, "global_step/max_steps": "3032/6362"} +{"lm loss": 4.95456457, "grad_norm": 0.56819659, "learning_rate": 5.921e-05, "elapsed_time_per_iteration": 6.61042333, "memory(GiB)": 21.51, "elapsed_time": "5h 32m 24s", "remaining_time": "6h 4m 51s", "loss_scale": 1.0, "consumed_samples": 776448, "global_step/max_steps": "3033/6362"} +{"lm loss": 4.96623421, "grad_norm": 0.53355151, "learning_rate": 5.918e-05, "elapsed_time_per_iteration": 6.55058551, "memory(GiB)": 21.51, "elapsed_time": "5h 32m 31s", "remaining_time": "6h 4m 44s", "loss_scale": 1.0, "consumed_samples": 776704, "global_step/max_steps": "3034/6362"} +{"lm loss": 4.92877579, "grad_norm": 0.54918152, "learning_rate": 5.916e-05, "elapsed_time_per_iteration": 6.60553837, "memory(GiB)": 21.51, "elapsed_time": "5h 32m 38s", "remaining_time": "6h 4m 38s", "loss_scale": 1.0, "consumed_samples": 776960, "global_step/max_steps": "3035/6362"} +{"lm loss": 4.92477751, "grad_norm": 0.49937984, "learning_rate": 5.913e-05, "elapsed_time_per_iteration": 6.55283475, "memory(GiB)": 21.51, "elapsed_time": "5h 32m 44s", "remaining_time": "6h 4m 31s", "loss_scale": 1.0, "consumed_samples": 777216, "global_step/max_steps": "3036/6362"} +{"lm loss": 4.96337938, "grad_norm": 0.50938439, "learning_rate": 5.911e-05, "elapsed_time_per_iteration": 6.55491853, "memory(GiB)": 21.51, "elapsed_time": "5h 32m 51s", "remaining_time": "6h 4m 24s", "loss_scale": 1.0, "consumed_samples": 777472, "global_step/max_steps": "3037/6362"} +{"lm loss": 4.92191315, "grad_norm": 0.47885022, "learning_rate": 5.908e-05, "elapsed_time_per_iteration": 6.48000765, "memory(GiB)": 21.51, "elapsed_time": "5h 32m 57s", "remaining_time": "6h 4m 18s", "loss_scale": 1.0, "consumed_samples": 777728, "global_step/max_steps": "3038/6362"} +{"lm loss": 4.93336105, "grad_norm": 0.53535926, "learning_rate": 5.906e-05, "elapsed_time_per_iteration": 6.70472336, "memory(GiB)": 21.51, "elapsed_time": "5h 33m 4s", "remaining_time": "6h 4m 11s", "loss_scale": 1.0, "consumed_samples": 777984, "global_step/max_steps": "3039/6362"} +{"lm loss": 4.95952988, "grad_norm": 0.56749922, "learning_rate": 5.903e-05, "elapsed_time_per_iteration": 6.99659443, "memory(GiB)": 21.51, "elapsed_time": "5h 33m 11s", "remaining_time": "6h 4m 5s", "loss_scale": 1.0, "consumed_samples": 778240, "global_step/max_steps": "3040/6362"} +{"lm loss": 4.93726015, "grad_norm": 0.50985825, "learning_rate": 5.901e-05, "elapsed_time_per_iteration": 6.7425704, "memory(GiB)": 21.51, "elapsed_time": "5h 33m 18s", "remaining_time": "6h 3m 59s", "loss_scale": 1.0, "consumed_samples": 778496, "global_step/max_steps": "3041/6362"} +{"lm loss": 4.9302187, "grad_norm": 0.44488806, "learning_rate": 5.898e-05, "elapsed_time_per_iteration": 6.62819219, "memory(GiB)": 21.51, "elapsed_time": "5h 33m 24s", "remaining_time": "6h 3m 52s", "loss_scale": 1.0, "consumed_samples": 778752, "global_step/max_steps": "3042/6362"} +{"lm loss": 4.92301226, "grad_norm": 0.47272328, "learning_rate": 5.896e-05, "elapsed_time_per_iteration": 6.61633563, "memory(GiB)": 21.51, "elapsed_time": "5h 33m 31s", "remaining_time": "6h 3m 46s", "loss_scale": 1.0, "consumed_samples": 779008, "global_step/max_steps": "3043/6362"} +{"lm loss": 4.94475937, "grad_norm": 0.5245924, "learning_rate": 5.893e-05, "elapsed_time_per_iteration": 6.60261011, "memory(GiB)": 21.51, "elapsed_time": "5h 33m 37s", "remaining_time": "6h 3m 39s", "loss_scale": 1.0, "consumed_samples": 779264, "global_step/max_steps": "3044/6362"} +{"lm loss": 4.95565128, "grad_norm": 0.53876573, "learning_rate": 5.891e-05, "elapsed_time_per_iteration": 6.69104171, "memory(GiB)": 21.51, "elapsed_time": "5h 33m 44s", "remaining_time": "6h 3m 33s", "loss_scale": 1.0, "consumed_samples": 779520, "global_step/max_steps": "3045/6362"} +{"lm loss": 4.94644547, "grad_norm": 0.59247732, "learning_rate": 5.888e-05, "elapsed_time_per_iteration": 6.49936223, "memory(GiB)": 21.51, "elapsed_time": "5h 33m 51s", "remaining_time": "6h 3m 26s", "loss_scale": 1.0, "consumed_samples": 779776, "global_step/max_steps": "3046/6362"} +{"lm loss": 4.93475819, "grad_norm": 0.56499839, "learning_rate": 5.886e-05, "elapsed_time_per_iteration": 6.69081402, "memory(GiB)": 21.51, "elapsed_time": "5h 33m 57s", "remaining_time": "6h 3m 20s", "loss_scale": 1.0, "consumed_samples": 780032, "global_step/max_steps": "3047/6362"} +{"lm loss": 4.94456053, "grad_norm": 0.53573161, "learning_rate": 5.883e-05, "elapsed_time_per_iteration": 6.57233953, "memory(GiB)": 21.51, "elapsed_time": "5h 34m 4s", "remaining_time": "6h 3m 13s", "loss_scale": 1.0, "consumed_samples": 780288, "global_step/max_steps": "3048/6362"} +{"lm loss": 4.92035341, "grad_norm": 0.51321024, "learning_rate": 5.881e-05, "elapsed_time_per_iteration": 6.90071893, "memory(GiB)": 21.51, "elapsed_time": "5h 34m 11s", "remaining_time": "6h 3m 7s", "loss_scale": 1.0, "consumed_samples": 780544, "global_step/max_steps": "3049/6362"} +{"lm loss": 4.95047951, "grad_norm": 0.56047511, "learning_rate": 5.878e-05, "elapsed_time_per_iteration": 6.6830864, "memory(GiB)": 21.51, "elapsed_time": "5h 34m 17s", "remaining_time": "6h 3m 0s", "loss_scale": 1.0, "consumed_samples": 780800, "global_step/max_steps": "3050/6362"} +{"lm loss": 4.93890333, "grad_norm": 0.58645445, "learning_rate": 5.876e-05, "elapsed_time_per_iteration": 6.37595844, "memory(GiB)": 21.51, "elapsed_time": "5h 34m 24s", "remaining_time": "6h 2m 54s", "loss_scale": 1.0, "consumed_samples": 781056, "global_step/max_steps": "3051/6362"} +{"lm loss": 4.95272064, "grad_norm": 0.55899662, "learning_rate": 5.873e-05, "elapsed_time_per_iteration": 6.30958867, "memory(GiB)": 21.51, "elapsed_time": "5h 34m 30s", "remaining_time": "6h 2m 47s", "loss_scale": 1.0, "consumed_samples": 781312, "global_step/max_steps": "3052/6362"} +{"lm loss": 4.9593854, "grad_norm": 0.54914463, "learning_rate": 5.871e-05, "elapsed_time_per_iteration": 6.61248851, "memory(GiB)": 21.51, "elapsed_time": "5h 34m 37s", "remaining_time": "6h 2m 40s", "loss_scale": 1.0, "consumed_samples": 781568, "global_step/max_steps": "3053/6362"} +{"lm loss": 4.94825268, "grad_norm": 0.53791189, "learning_rate": 5.868e-05, "elapsed_time_per_iteration": 6.45968747, "memory(GiB)": 21.51, "elapsed_time": "5h 34m 43s", "remaining_time": "6h 2m 34s", "loss_scale": 1.0, "consumed_samples": 781824, "global_step/max_steps": "3054/6362"} +{"lm loss": 4.93686152, "grad_norm": 0.55285966, "learning_rate": 5.866e-05, "elapsed_time_per_iteration": 6.71987653, "memory(GiB)": 21.51, "elapsed_time": "5h 34m 50s", "remaining_time": "6h 2m 27s", "loss_scale": 1.0, "consumed_samples": 782080, "global_step/max_steps": "3055/6362"} +{"lm loss": 4.95912981, "grad_norm": 0.63635099, "learning_rate": 5.863e-05, "elapsed_time_per_iteration": 6.39280987, "memory(GiB)": 21.51, "elapsed_time": "5h 34m 56s", "remaining_time": "6h 2m 20s", "loss_scale": 1.0, "consumed_samples": 782336, "global_step/max_steps": "3056/6362"} +{"lm loss": 4.93842745, "grad_norm": 0.51798713, "learning_rate": 5.861e-05, "elapsed_time_per_iteration": 6.91014266, "memory(GiB)": 21.51, "elapsed_time": "5h 35m 3s", "remaining_time": "6h 2m 14s", "loss_scale": 1.0, "consumed_samples": 782592, "global_step/max_steps": "3057/6362"} +{"lm loss": 4.93063164, "grad_norm": 0.5251354, "learning_rate": 5.859e-05, "elapsed_time_per_iteration": 6.69106817, "memory(GiB)": 21.51, "elapsed_time": "5h 35m 10s", "remaining_time": "6h 2m 8s", "loss_scale": 1.0, "consumed_samples": 782848, "global_step/max_steps": "3058/6362"} +{"lm loss": 4.92073441, "grad_norm": 0.54356831, "learning_rate": 5.856e-05, "elapsed_time_per_iteration": 6.54114532, "memory(GiB)": 21.51, "elapsed_time": "5h 35m 16s", "remaining_time": "6h 2m 1s", "loss_scale": 1.0, "consumed_samples": 783104, "global_step/max_steps": "3059/6362"} +{"lm loss": 4.92955637, "grad_norm": 0.59370601, "learning_rate": 5.854e-05, "elapsed_time_per_iteration": 6.40475655, "memory(GiB)": 21.51, "elapsed_time": "5h 35m 23s", "remaining_time": "6h 1m 54s", "loss_scale": 1.0, "consumed_samples": 783360, "global_step/max_steps": "3060/6362"} +{"lm loss": 4.93363857, "grad_norm": 0.57704902, "learning_rate": 5.851e-05, "elapsed_time_per_iteration": 6.31657791, "memory(GiB)": 21.51, "elapsed_time": "5h 35m 29s", "remaining_time": "6h 1m 47s", "loss_scale": 1.0, "consumed_samples": 783616, "global_step/max_steps": "3061/6362"} +{"lm loss": 4.9331584, "grad_norm": 0.49420863, "learning_rate": 5.849e-05, "elapsed_time_per_iteration": 6.38989067, "memory(GiB)": 21.51, "elapsed_time": "5h 35m 36s", "remaining_time": "6h 1m 41s", "loss_scale": 1.0, "consumed_samples": 783872, "global_step/max_steps": "3062/6362"} +{"lm loss": 4.9635129, "grad_norm": 0.55373853, "learning_rate": 5.846e-05, "elapsed_time_per_iteration": 6.44079518, "memory(GiB)": 21.51, "elapsed_time": "5h 35m 42s", "remaining_time": "6h 1m 34s", "loss_scale": 1.0, "consumed_samples": 784128, "global_step/max_steps": "3063/6362"} +{"lm loss": 4.9543643, "grad_norm": 0.57487404, "learning_rate": 5.844e-05, "elapsed_time_per_iteration": 6.56225491, "memory(GiB)": 21.51, "elapsed_time": "5h 35m 49s", "remaining_time": "6h 1m 27s", "loss_scale": 1.0, "consumed_samples": 784384, "global_step/max_steps": "3064/6362"} +{"lm loss": 4.92705011, "grad_norm": 0.49151137, "learning_rate": 5.841e-05, "elapsed_time_per_iteration": 6.4811399, "memory(GiB)": 21.51, "elapsed_time": "5h 35m 55s", "remaining_time": "6h 1m 21s", "loss_scale": 1.0, "consumed_samples": 784640, "global_step/max_steps": "3065/6362"} +{"lm loss": 4.94228315, "grad_norm": 0.52877629, "learning_rate": 5.839e-05, "elapsed_time_per_iteration": 6.50146961, "memory(GiB)": 21.51, "elapsed_time": "5h 36m 2s", "remaining_time": "6h 1m 14s", "loss_scale": 1.0, "consumed_samples": 784896, "global_step/max_steps": "3066/6362"} +{"lm loss": 4.95996141, "grad_norm": 0.48197958, "learning_rate": 5.836e-05, "elapsed_time_per_iteration": 6.49993181, "memory(GiB)": 21.51, "elapsed_time": "5h 36m 8s", "remaining_time": "6h 1m 7s", "loss_scale": 1.0, "consumed_samples": 785152, "global_step/max_steps": "3067/6362"} +{"lm loss": 4.93586159, "grad_norm": 0.5071137, "learning_rate": 5.834e-05, "elapsed_time_per_iteration": 6.83370996, "memory(GiB)": 21.51, "elapsed_time": "5h 36m 15s", "remaining_time": "6h 1m 1s", "loss_scale": 1.0, "consumed_samples": 785408, "global_step/max_steps": "3068/6362"} +{"lm loss": 4.94510603, "grad_norm": 0.57623124, "learning_rate": 5.831e-05, "elapsed_time_per_iteration": 6.49784517, "memory(GiB)": 21.51, "elapsed_time": "5h 36m 21s", "remaining_time": "6h 0m 54s", "loss_scale": 1.0, "consumed_samples": 785664, "global_step/max_steps": "3069/6362"} +{"lm loss": 4.92562962, "grad_norm": 0.51668578, "learning_rate": 5.829e-05, "elapsed_time_per_iteration": 6.79798198, "memory(GiB)": 21.51, "elapsed_time": "5h 36m 28s", "remaining_time": "6h 0m 48s", "loss_scale": 1.0, "consumed_samples": 785920, "global_step/max_steps": "3070/6362"} +{"lm loss": 4.91232061, "grad_norm": 0.52596217, "learning_rate": 5.826e-05, "elapsed_time_per_iteration": 6.54896784, "memory(GiB)": 21.51, "elapsed_time": "5h 36m 35s", "remaining_time": "6h 0m 41s", "loss_scale": 1.0, "consumed_samples": 786176, "global_step/max_steps": "3071/6362"} +{"lm loss": 4.95096397, "grad_norm": 0.59712189, "learning_rate": 5.824e-05, "elapsed_time_per_iteration": 6.36342645, "memory(GiB)": 21.51, "elapsed_time": "5h 36m 41s", "remaining_time": "6h 0m 35s", "loss_scale": 1.0, "consumed_samples": 786432, "global_step/max_steps": "3072/6362"} +{"lm loss": 4.93029785, "grad_norm": 0.568555, "learning_rate": 5.821e-05, "elapsed_time_per_iteration": 6.58747816, "memory(GiB)": 21.51, "elapsed_time": "5h 36m 48s", "remaining_time": "6h 0m 28s", "loss_scale": 1.0, "consumed_samples": 786688, "global_step/max_steps": "3073/6362"} +{"lm loss": 4.9595108, "grad_norm": 0.67353094, "learning_rate": 5.819e-05, "elapsed_time_per_iteration": 6.53781605, "memory(GiB)": 21.51, "elapsed_time": "5h 36m 54s", "remaining_time": "6h 0m 21s", "loss_scale": 1.0, "consumed_samples": 786944, "global_step/max_steps": "3074/6362"} +{"lm loss": 4.94520187, "grad_norm": 0.62285805, "learning_rate": 5.816e-05, "elapsed_time_per_iteration": 6.4451189, "memory(GiB)": 21.51, "elapsed_time": "5h 37m 1s", "remaining_time": "6h 0m 15s", "loss_scale": 1.0, "consumed_samples": 787200, "global_step/max_steps": "3075/6362"} +{"lm loss": 4.94335365, "grad_norm": 0.54110336, "learning_rate": 5.814e-05, "elapsed_time_per_iteration": 6.68591332, "memory(GiB)": 21.51, "elapsed_time": "5h 37m 7s", "remaining_time": "6h 0m 8s", "loss_scale": 1.0, "consumed_samples": 787456, "global_step/max_steps": "3076/6362"} +{"lm loss": 4.94387245, "grad_norm": 0.52147806, "learning_rate": 5.811e-05, "elapsed_time_per_iteration": 6.61805105, "memory(GiB)": 21.51, "elapsed_time": "5h 37m 14s", "remaining_time": "6h 0m 2s", "loss_scale": 1.0, "consumed_samples": 787712, "global_step/max_steps": "3077/6362"} +{"lm loss": 4.93202734, "grad_norm": 0.57049602, "learning_rate": 5.809e-05, "elapsed_time_per_iteration": 6.58372569, "memory(GiB)": 21.51, "elapsed_time": "5h 37m 21s", "remaining_time": "5h 59m 55s", "loss_scale": 1.0, "consumed_samples": 787968, "global_step/max_steps": "3078/6362"} +{"lm loss": 4.9333334, "grad_norm": 0.57289469, "learning_rate": 5.806e-05, "elapsed_time_per_iteration": 6.55361009, "memory(GiB)": 21.51, "elapsed_time": "5h 37m 27s", "remaining_time": "5h 59m 49s", "loss_scale": 1.0, "consumed_samples": 788224, "global_step/max_steps": "3079/6362"} +{"lm loss": 4.94056463, "grad_norm": 0.55036432, "learning_rate": 5.804e-05, "elapsed_time_per_iteration": 6.51516104, "memory(GiB)": 21.51, "elapsed_time": "5h 37m 34s", "remaining_time": "5h 59m 42s", "loss_scale": 1.0, "consumed_samples": 788480, "global_step/max_steps": "3080/6362"} +{"lm loss": 4.93492031, "grad_norm": 0.5121358, "learning_rate": 5.801e-05, "elapsed_time_per_iteration": 6.38898659, "memory(GiB)": 21.51, "elapsed_time": "5h 37m 40s", "remaining_time": "5h 59m 35s", "loss_scale": 1.0, "consumed_samples": 788736, "global_step/max_steps": "3081/6362"} +{"lm loss": 4.93801403, "grad_norm": 0.56604284, "learning_rate": 5.799e-05, "elapsed_time_per_iteration": 6.39348912, "memory(GiB)": 21.51, "elapsed_time": "5h 37m 46s", "remaining_time": "5h 59m 28s", "loss_scale": 1.0, "consumed_samples": 788992, "global_step/max_steps": "3082/6362"} +{"lm loss": 4.96190643, "grad_norm": 0.51707029, "learning_rate": 5.796e-05, "elapsed_time_per_iteration": 6.66949964, "memory(GiB)": 21.51, "elapsed_time": "5h 37m 53s", "remaining_time": "5h 59m 22s", "loss_scale": 1.0, "consumed_samples": 789248, "global_step/max_steps": "3083/6362"} +{"lm loss": 4.93716383, "grad_norm": 0.51769841, "learning_rate": 5.794e-05, "elapsed_time_per_iteration": 6.56478596, "memory(GiB)": 21.51, "elapsed_time": "5h 38m 0s", "remaining_time": "5h 59m 15s", "loss_scale": 1.0, "consumed_samples": 789504, "global_step/max_steps": "3084/6362"} +{"lm loss": 4.93385696, "grad_norm": 0.51855576, "learning_rate": 5.791e-05, "elapsed_time_per_iteration": 6.50220561, "memory(GiB)": 21.51, "elapsed_time": "5h 38m 6s", "remaining_time": "5h 59m 9s", "loss_scale": 1.0, "consumed_samples": 789760, "global_step/max_steps": "3085/6362"} +{"lm loss": 4.91456223, "grad_norm": 0.49460232, "learning_rate": 5.789e-05, "elapsed_time_per_iteration": 6.41498184, "memory(GiB)": 21.51, "elapsed_time": "5h 38m 13s", "remaining_time": "5h 59m 2s", "loss_scale": 1.0, "consumed_samples": 790016, "global_step/max_steps": "3086/6362"} +{"lm loss": 4.93995476, "grad_norm": 0.50738895, "learning_rate": 5.786e-05, "elapsed_time_per_iteration": 6.68052602, "memory(GiB)": 21.51, "elapsed_time": "5h 38m 19s", "remaining_time": "5h 58m 55s", "loss_scale": 1.0, "consumed_samples": 790272, "global_step/max_steps": "3087/6362"} +{"lm loss": 4.93297386, "grad_norm": 0.46152318, "learning_rate": 5.784e-05, "elapsed_time_per_iteration": 6.50287557, "memory(GiB)": 21.51, "elapsed_time": "5h 38m 26s", "remaining_time": "5h 58m 49s", "loss_scale": 1.0, "consumed_samples": 790528, "global_step/max_steps": "3088/6362"} +{"lm loss": 4.95310974, "grad_norm": 0.46787283, "learning_rate": 5.781e-05, "elapsed_time_per_iteration": 6.54802251, "memory(GiB)": 21.51, "elapsed_time": "5h 38m 32s", "remaining_time": "5h 58m 42s", "loss_scale": 1.0, "consumed_samples": 790784, "global_step/max_steps": "3089/6362"} +{"lm loss": 4.9326582, "grad_norm": 0.46471345, "learning_rate": 5.779e-05, "elapsed_time_per_iteration": 6.5068779, "memory(GiB)": 21.51, "elapsed_time": "5h 38m 39s", "remaining_time": "5h 58m 36s", "loss_scale": 1.0, "consumed_samples": 791040, "global_step/max_steps": "3090/6362"} +{"lm loss": 4.95478678, "grad_norm": 0.48793212, "learning_rate": 5.776e-05, "elapsed_time_per_iteration": 6.52127814, "memory(GiB)": 21.51, "elapsed_time": "5h 38m 45s", "remaining_time": "5h 58m 29s", "loss_scale": 1.0, "consumed_samples": 791296, "global_step/max_steps": "3091/6362"} +{"lm loss": 4.92617178, "grad_norm": 0.47486025, "learning_rate": 5.774e-05, "elapsed_time_per_iteration": 6.72042012, "memory(GiB)": 21.51, "elapsed_time": "5h 38m 52s", "remaining_time": "5h 58m 23s", "loss_scale": 1.0, "consumed_samples": 791552, "global_step/max_steps": "3092/6362"} +{"lm loss": 4.94874096, "grad_norm": 0.56155628, "learning_rate": 5.771e-05, "elapsed_time_per_iteration": 6.60256171, "memory(GiB)": 21.51, "elapsed_time": "5h 38m 59s", "remaining_time": "5h 58m 16s", "loss_scale": 1.0, "consumed_samples": 791808, "global_step/max_steps": "3093/6362"} +{"lm loss": 4.96203947, "grad_norm": 0.7223419, "learning_rate": 5.769e-05, "elapsed_time_per_iteration": 6.59353733, "memory(GiB)": 21.51, "elapsed_time": "5h 39m 5s", "remaining_time": "5h 58m 9s", "loss_scale": 1.0, "consumed_samples": 792064, "global_step/max_steps": "3094/6362"} +{"lm loss": 4.96160221, "grad_norm": 0.86190921, "learning_rate": 5.766e-05, "elapsed_time_per_iteration": 6.69715858, "memory(GiB)": 21.51, "elapsed_time": "5h 39m 12s", "remaining_time": "5h 58m 3s", "loss_scale": 1.0, "consumed_samples": 792320, "global_step/max_steps": "3095/6362"} +{"lm loss": 4.93818045, "grad_norm": 0.67750144, "learning_rate": 5.764e-05, "elapsed_time_per_iteration": 6.82915163, "memory(GiB)": 21.51, "elapsed_time": "5h 39m 19s", "remaining_time": "5h 57m 57s", "loss_scale": 1.0, "consumed_samples": 792576, "global_step/max_steps": "3096/6362"} +{"lm loss": 4.97476435, "grad_norm": 0.51421446, "learning_rate": 5.761e-05, "elapsed_time_per_iteration": 6.42028308, "memory(GiB)": 21.51, "elapsed_time": "5h 39m 25s", "remaining_time": "5h 57m 50s", "loss_scale": 1.0, "consumed_samples": 792832, "global_step/max_steps": "3097/6362"} +{"lm loss": 4.93962383, "grad_norm": 0.67537415, "learning_rate": 5.759e-05, "elapsed_time_per_iteration": 6.53672075, "memory(GiB)": 21.51, "elapsed_time": "5h 39m 32s", "remaining_time": "5h 57m 43s", "loss_scale": 1.0, "consumed_samples": 793088, "global_step/max_steps": "3098/6362"} +{"lm loss": 4.93910027, "grad_norm": 0.69874626, "learning_rate": 5.756e-05, "elapsed_time_per_iteration": 6.34249234, "memory(GiB)": 21.51, "elapsed_time": "5h 39m 38s", "remaining_time": "5h 57m 36s", "loss_scale": 1.0, "consumed_samples": 793344, "global_step/max_steps": "3099/6362"} +{"lm loss": 4.94036198, "grad_norm": 0.51642293, "learning_rate": 5.754e-05, "elapsed_time_per_iteration": 6.54439354, "memory(GiB)": 21.51, "elapsed_time": "5h 39m 45s", "remaining_time": "5h 57m 30s", "loss_scale": 1.0, "consumed_samples": 793600, "global_step/max_steps": "3100/6362"} +{"lm loss": 4.91221523, "grad_norm": 0.52141637, "learning_rate": 5.751e-05, "elapsed_time_per_iteration": 6.39902496, "memory(GiB)": 21.51, "elapsed_time": "5h 39m 51s", "remaining_time": "5h 57m 23s", "loss_scale": 1.0, "consumed_samples": 793856, "global_step/max_steps": "3101/6362"} +{"lm loss": 4.9450922, "grad_norm": 0.60234135, "learning_rate": 5.749e-05, "elapsed_time_per_iteration": 6.53915048, "memory(GiB)": 21.51, "elapsed_time": "5h 39m 58s", "remaining_time": "5h 57m 17s", "loss_scale": 1.0, "consumed_samples": 794112, "global_step/max_steps": "3102/6362"} +{"lm loss": 4.94300938, "grad_norm": 0.57857281, "learning_rate": 5.746e-05, "elapsed_time_per_iteration": 6.4888649, "memory(GiB)": 21.51, "elapsed_time": "5h 40m 4s", "remaining_time": "5h 57m 10s", "loss_scale": 1.0, "consumed_samples": 794368, "global_step/max_steps": "3103/6362"} +{"lm loss": 4.93663025, "grad_norm": 0.47509038, "learning_rate": 5.744e-05, "elapsed_time_per_iteration": 6.66353822, "memory(GiB)": 21.51, "elapsed_time": "5h 40m 11s", "remaining_time": "5h 57m 3s", "loss_scale": 1.0, "consumed_samples": 794624, "global_step/max_steps": "3104/6362"} +{"lm loss": 4.91455412, "grad_norm": 0.47194782, "learning_rate": 5.741e-05, "elapsed_time_per_iteration": 6.4597671, "memory(GiB)": 21.51, "elapsed_time": "5h 40m 17s", "remaining_time": "5h 56m 57s", "loss_scale": 1.0, "consumed_samples": 794880, "global_step/max_steps": "3105/6362"} +{"lm loss": 4.97270775, "grad_norm": 0.61338764, "learning_rate": 5.739e-05, "elapsed_time_per_iteration": 6.53654575, "memory(GiB)": 21.51, "elapsed_time": "5h 40m 24s", "remaining_time": "5h 56m 50s", "loss_scale": 1.0, "consumed_samples": 795136, "global_step/max_steps": "3106/6362"} +{"lm loss": 4.93280745, "grad_norm": 0.60541999, "learning_rate": 5.736e-05, "elapsed_time_per_iteration": 6.72521377, "memory(GiB)": 21.51, "elapsed_time": "5h 40m 30s", "remaining_time": "5h 56m 44s", "loss_scale": 1.0, "consumed_samples": 795392, "global_step/max_steps": "3107/6362"} +{"lm loss": 4.97223854, "grad_norm": 0.50191146, "learning_rate": 5.734e-05, "elapsed_time_per_iteration": 6.92587495, "memory(GiB)": 21.51, "elapsed_time": "5h 40m 37s", "remaining_time": "5h 56m 37s", "loss_scale": 1.0, "consumed_samples": 795648, "global_step/max_steps": "3108/6362"} +{"lm loss": 4.93288517, "grad_norm": 0.53007233, "learning_rate": 5.731e-05, "elapsed_time_per_iteration": 6.51039648, "memory(GiB)": 21.51, "elapsed_time": "5h 40m 44s", "remaining_time": "5h 56m 31s", "loss_scale": 1.0, "consumed_samples": 795904, "global_step/max_steps": "3109/6362"} +{"lm loss": 4.92841101, "grad_norm": 0.47038984, "learning_rate": 5.729e-05, "elapsed_time_per_iteration": 6.71120024, "memory(GiB)": 21.51, "elapsed_time": "5h 40m 51s", "remaining_time": "5h 56m 24s", "loss_scale": 1.0, "consumed_samples": 796160, "global_step/max_steps": "3110/6362"} +{"lm loss": 4.94988585, "grad_norm": 0.48828346, "learning_rate": 5.726e-05, "elapsed_time_per_iteration": 6.66719842, "memory(GiB)": 21.51, "elapsed_time": "5h 40m 57s", "remaining_time": "5h 56m 18s", "loss_scale": 1.0, "consumed_samples": 796416, "global_step/max_steps": "3111/6362"} +{"lm loss": 4.96482611, "grad_norm": 0.46350369, "learning_rate": 5.724e-05, "elapsed_time_per_iteration": 6.81217623, "memory(GiB)": 21.51, "elapsed_time": "5h 41m 4s", "remaining_time": "5h 56m 12s", "loss_scale": 1.0, "consumed_samples": 796672, "global_step/max_steps": "3112/6362"} +{"lm loss": 4.97639894, "grad_norm": 0.43809426, "learning_rate": 5.721e-05, "elapsed_time_per_iteration": 6.62617493, "memory(GiB)": 21.51, "elapsed_time": "5h 41m 11s", "remaining_time": "5h 56m 5s", "loss_scale": 1.0, "consumed_samples": 796928, "global_step/max_steps": "3113/6362"} +{"lm loss": 4.92790174, "grad_norm": 0.45985252, "learning_rate": 5.719e-05, "elapsed_time_per_iteration": 6.67592478, "memory(GiB)": 21.51, "elapsed_time": "5h 41m 17s", "remaining_time": "5h 55m 59s", "loss_scale": 1.0, "consumed_samples": 797184, "global_step/max_steps": "3114/6362"} +{"lm loss": 4.92400026, "grad_norm": 0.45292535, "learning_rate": 5.716e-05, "elapsed_time_per_iteration": 6.57232475, "memory(GiB)": 21.51, "elapsed_time": "5h 41m 24s", "remaining_time": "5h 55m 52s", "loss_scale": 1.0, "consumed_samples": 797440, "global_step/max_steps": "3115/6362"} +{"lm loss": 4.95366144, "grad_norm": 0.53842515, "learning_rate": 5.714e-05, "elapsed_time_per_iteration": 6.48093104, "memory(GiB)": 21.51, "elapsed_time": "5h 41m 30s", "remaining_time": "5h 55m 45s", "loss_scale": 1.0, "consumed_samples": 797696, "global_step/max_steps": "3116/6362"} +{"lm loss": 4.94594669, "grad_norm": 0.54333311, "learning_rate": 5.711e-05, "elapsed_time_per_iteration": 6.47201943, "memory(GiB)": 21.51, "elapsed_time": "5h 41m 37s", "remaining_time": "5h 55m 39s", "loss_scale": 1.0, "consumed_samples": 797952, "global_step/max_steps": "3117/6362"} +{"lm loss": 4.93486595, "grad_norm": 0.48898262, "learning_rate": 5.709e-05, "elapsed_time_per_iteration": 6.4323771, "memory(GiB)": 21.51, "elapsed_time": "5h 41m 43s", "remaining_time": "5h 55m 32s", "loss_scale": 1.0, "consumed_samples": 798208, "global_step/max_steps": "3118/6362"} +{"lm loss": 4.92218971, "grad_norm": 0.4591299, "learning_rate": 5.706e-05, "elapsed_time_per_iteration": 6.50135279, "memory(GiB)": 21.51, "elapsed_time": "5h 41m 50s", "remaining_time": "5h 55m 25s", "loss_scale": 1.0, "consumed_samples": 798464, "global_step/max_steps": "3119/6362"} +{"lm loss": 4.9300766, "grad_norm": 0.4547675, "learning_rate": 5.704e-05, "elapsed_time_per_iteration": 6.65886998, "memory(GiB)": 21.51, "elapsed_time": "5h 41m 56s", "remaining_time": "5h 55m 19s", "loss_scale": 1.0, "consumed_samples": 798720, "global_step/max_steps": "3120/6362"} +{"lm loss": 4.92652416, "grad_norm": 0.52314901, "learning_rate": 5.701e-05, "elapsed_time_per_iteration": 6.39114165, "memory(GiB)": 21.51, "elapsed_time": "5h 42m 3s", "remaining_time": "5h 55m 12s", "loss_scale": 1.0, "consumed_samples": 798976, "global_step/max_steps": "3121/6362"} +{"lm loss": 4.93017483, "grad_norm": 0.57817233, "learning_rate": 5.699e-05, "elapsed_time_per_iteration": 6.41314101, "memory(GiB)": 21.51, "elapsed_time": "5h 42m 9s", "remaining_time": "5h 55m 5s", "loss_scale": 1.0, "consumed_samples": 799232, "global_step/max_steps": "3122/6362"} +{"lm loss": 4.91831589, "grad_norm": 0.53710586, "learning_rate": 5.696e-05, "elapsed_time_per_iteration": 6.3923285, "memory(GiB)": 21.51, "elapsed_time": "5h 42m 16s", "remaining_time": "5h 54m 58s", "loss_scale": 1.0, "consumed_samples": 799488, "global_step/max_steps": "3123/6362"} +{"lm loss": 4.94364595, "grad_norm": 0.52874047, "learning_rate": 5.694e-05, "elapsed_time_per_iteration": 6.54946637, "memory(GiB)": 21.51, "elapsed_time": "5h 42m 22s", "remaining_time": "5h 54m 52s", "loss_scale": 1.0, "consumed_samples": 799744, "global_step/max_steps": "3124/6362"} +{"lm loss": 4.93757057, "grad_norm": 0.46946046, "learning_rate": 5.691e-05, "elapsed_time_per_iteration": 6.52567053, "memory(GiB)": 21.51, "elapsed_time": "5h 42m 29s", "remaining_time": "5h 54m 45s", "loss_scale": 1.0, "consumed_samples": 800000, "global_step/max_steps": "3125/6362"} +{"lm loss": 4.9092927, "grad_norm": 0.45117256, "learning_rate": 5.689e-05, "elapsed_time_per_iteration": 6.59788871, "memory(GiB)": 21.51, "elapsed_time": "5h 42m 35s", "remaining_time": "5h 54m 39s", "loss_scale": 1.0, "consumed_samples": 800256, "global_step/max_steps": "3126/6362"} +{"lm loss": 4.94210434, "grad_norm": 0.51813656, "learning_rate": 5.686e-05, "elapsed_time_per_iteration": 6.54929328, "memory(GiB)": 21.51, "elapsed_time": "5h 42m 42s", "remaining_time": "5h 54m 32s", "loss_scale": 1.0, "consumed_samples": 800512, "global_step/max_steps": "3127/6362"} +{"lm loss": 4.95287991, "grad_norm": 0.51784605, "learning_rate": 5.683e-05, "elapsed_time_per_iteration": 6.5623529, "memory(GiB)": 21.51, "elapsed_time": "5h 42m 48s", "remaining_time": "5h 54m 25s", "loss_scale": 1.0, "consumed_samples": 800768, "global_step/max_steps": "3128/6362"} +{"lm loss": 4.9182806, "grad_norm": 0.58817935, "learning_rate": 5.681e-05, "elapsed_time_per_iteration": 6.69792771, "memory(GiB)": 21.51, "elapsed_time": "5h 42m 55s", "remaining_time": "5h 54m 19s", "loss_scale": 1.0, "consumed_samples": 801024, "global_step/max_steps": "3129/6362"} +{"lm loss": 4.9226613, "grad_norm": 0.5283193, "learning_rate": 5.678e-05, "elapsed_time_per_iteration": 6.4976027, "memory(GiB)": 21.51, "elapsed_time": "5h 43m 2s", "remaining_time": "5h 54m 12s", "loss_scale": 1.0, "consumed_samples": 801280, "global_step/max_steps": "3130/6362"} +{"lm loss": 4.92704678, "grad_norm": 0.45965317, "learning_rate": 5.676e-05, "elapsed_time_per_iteration": 6.68703914, "memory(GiB)": 21.51, "elapsed_time": "5h 43m 8s", "remaining_time": "5h 54m 6s", "loss_scale": 1.0, "consumed_samples": 801536, "global_step/max_steps": "3131/6362"} +{"lm loss": 4.90245295, "grad_norm": 0.56420422, "learning_rate": 5.673e-05, "elapsed_time_per_iteration": 6.97376037, "memory(GiB)": 21.51, "elapsed_time": "5h 43m 15s", "remaining_time": "5h 54m 0s", "loss_scale": 1.0, "consumed_samples": 801792, "global_step/max_steps": "3132/6362"} +{"lm loss": 4.95278502, "grad_norm": 0.50161374, "learning_rate": 5.671e-05, "elapsed_time_per_iteration": 6.66947842, "memory(GiB)": 21.51, "elapsed_time": "5h 43m 22s", "remaining_time": "5h 53m 53s", "loss_scale": 1.0, "consumed_samples": 802048, "global_step/max_steps": "3133/6362"} +{"lm loss": 4.93245506, "grad_norm": 0.53598636, "learning_rate": 5.668e-05, "elapsed_time_per_iteration": 6.86631799, "memory(GiB)": 21.51, "elapsed_time": "5h 43m 29s", "remaining_time": "5h 53m 47s", "loss_scale": 1.0, "consumed_samples": 802304, "global_step/max_steps": "3134/6362"} +{"lm loss": 4.9363656, "grad_norm": 0.64693385, "learning_rate": 5.666e-05, "elapsed_time_per_iteration": 6.64577293, "memory(GiB)": 21.51, "elapsed_time": "5h 43m 35s", "remaining_time": "5h 53m 40s", "loss_scale": 1.0, "consumed_samples": 802560, "global_step/max_steps": "3135/6362"} +{"lm loss": 4.93455029, "grad_norm": 0.54624391, "learning_rate": 5.663e-05, "elapsed_time_per_iteration": 6.55089164, "memory(GiB)": 21.51, "elapsed_time": "5h 43m 42s", "remaining_time": "5h 53m 34s", "loss_scale": 1.0, "consumed_samples": 802816, "global_step/max_steps": "3136/6362"} +{"lm loss": 4.95681429, "grad_norm": 0.58155519, "learning_rate": 5.661e-05, "elapsed_time_per_iteration": 6.45667243, "memory(GiB)": 21.51, "elapsed_time": "5h 43m 48s", "remaining_time": "5h 53m 27s", "loss_scale": 1.0, "consumed_samples": 803072, "global_step/max_steps": "3137/6362"} +{"lm loss": 4.93043137, "grad_norm": 0.59680998, "learning_rate": 5.658e-05, "elapsed_time_per_iteration": 6.75357461, "memory(GiB)": 21.51, "elapsed_time": "5h 43m 55s", "remaining_time": "5h 53m 21s", "loss_scale": 1.0, "consumed_samples": 803328, "global_step/max_steps": "3138/6362"} +{"lm loss": 4.92964077, "grad_norm": 0.55020773, "learning_rate": 5.656e-05, "elapsed_time_per_iteration": 6.56789756, "memory(GiB)": 21.51, "elapsed_time": "5h 44m 2s", "remaining_time": "5h 53m 14s", "loss_scale": 1.0, "consumed_samples": 803584, "global_step/max_steps": "3139/6362"} +{"lm loss": 4.93049812, "grad_norm": 0.59194332, "learning_rate": 5.653e-05, "elapsed_time_per_iteration": 6.6212585, "memory(GiB)": 21.51, "elapsed_time": "5h 44m 8s", "remaining_time": "5h 53m 8s", "loss_scale": 1.0, "consumed_samples": 803840, "global_step/max_steps": "3140/6362"} +{"lm loss": 4.9534297, "grad_norm": 0.50848252, "learning_rate": 5.651e-05, "elapsed_time_per_iteration": 6.63438153, "memory(GiB)": 21.51, "elapsed_time": "5h 44m 15s", "remaining_time": "5h 53m 1s", "loss_scale": 1.0, "consumed_samples": 804096, "global_step/max_steps": "3141/6362"} +{"lm loss": 4.92822504, "grad_norm": 0.53155935, "learning_rate": 5.648e-05, "elapsed_time_per_iteration": 6.62776542, "memory(GiB)": 21.51, "elapsed_time": "5h 44m 22s", "remaining_time": "5h 52m 55s", "loss_scale": 1.0, "consumed_samples": 804352, "global_step/max_steps": "3142/6362"} +{"lm loss": 4.9505229, "grad_norm": 0.58256966, "learning_rate": 5.646e-05, "elapsed_time_per_iteration": 6.69561601, "memory(GiB)": 21.51, "elapsed_time": "5h 44m 28s", "remaining_time": "5h 52m 48s", "loss_scale": 1.0, "consumed_samples": 804608, "global_step/max_steps": "3143/6362"} +{"lm loss": 4.95645571, "grad_norm": 0.52240729, "learning_rate": 5.643e-05, "elapsed_time_per_iteration": 6.66355729, "memory(GiB)": 21.51, "elapsed_time": "5h 44m 35s", "remaining_time": "5h 52m 42s", "loss_scale": 1.0, "consumed_samples": 804864, "global_step/max_steps": "3144/6362"} +{"lm loss": 4.96702003, "grad_norm": 0.61686778, "learning_rate": 5.641e-05, "elapsed_time_per_iteration": 6.50544763, "memory(GiB)": 21.51, "elapsed_time": "5h 44m 42s", "remaining_time": "5h 52m 35s", "loss_scale": 1.0, "consumed_samples": 805120, "global_step/max_steps": "3145/6362"} +{"lm loss": 4.95803976, "grad_norm": 0.52163708, "learning_rate": 5.638e-05, "elapsed_time_per_iteration": 6.39565778, "memory(GiB)": 21.51, "elapsed_time": "5h 44m 48s", "remaining_time": "5h 52m 28s", "loss_scale": 1.0, "consumed_samples": 805376, "global_step/max_steps": "3146/6362"} +{"lm loss": 4.92858267, "grad_norm": 0.53942102, "learning_rate": 5.636e-05, "elapsed_time_per_iteration": 6.43497777, "memory(GiB)": 21.51, "elapsed_time": "5h 44m 54s", "remaining_time": "5h 52m 22s", "loss_scale": 1.0, "consumed_samples": 805632, "global_step/max_steps": "3147/6362"} +{"lm loss": 4.95058584, "grad_norm": 0.46953174, "learning_rate": 5.633e-05, "elapsed_time_per_iteration": 6.57576585, "memory(GiB)": 21.51, "elapsed_time": "5h 45m 1s", "remaining_time": "5h 52m 15s", "loss_scale": 1.0, "consumed_samples": 805888, "global_step/max_steps": "3148/6362"} +{"lm loss": 4.91882706, "grad_norm": 0.50671035, "learning_rate": 5.631e-05, "elapsed_time_per_iteration": 6.73640847, "memory(GiB)": 21.51, "elapsed_time": "5h 45m 8s", "remaining_time": "5h 52m 9s", "loss_scale": 1.0, "consumed_samples": 806144, "global_step/max_steps": "3149/6362"} +{"lm loss": 4.9300952, "grad_norm": 0.5724532, "learning_rate": 5.628e-05, "elapsed_time_per_iteration": 6.58292055, "memory(GiB)": 21.51, "elapsed_time": "5h 45m 14s", "remaining_time": "5h 52m 2s", "loss_scale": 1.0, "consumed_samples": 806400, "global_step/max_steps": "3150/6362"} +{"lm loss": 4.94407415, "grad_norm": 0.45283183, "learning_rate": 5.626e-05, "elapsed_time_per_iteration": 6.30342889, "memory(GiB)": 21.51, "elapsed_time": "5h 45m 21s", "remaining_time": "5h 51m 55s", "loss_scale": 1.0, "consumed_samples": 806656, "global_step/max_steps": "3151/6362"} +{"lm loss": 4.94188929, "grad_norm": 0.45490733, "learning_rate": 5.623e-05, "elapsed_time_per_iteration": 6.42067647, "memory(GiB)": 21.51, "elapsed_time": "5h 45m 27s", "remaining_time": "5h 51m 48s", "loss_scale": 1.0, "consumed_samples": 806912, "global_step/max_steps": "3152/6362"} +{"lm loss": 4.92326212, "grad_norm": 0.55206728, "learning_rate": 5.621e-05, "elapsed_time_per_iteration": 6.41245413, "memory(GiB)": 21.51, "elapsed_time": "5h 45m 33s", "remaining_time": "5h 51m 42s", "loss_scale": 1.0, "consumed_samples": 807168, "global_step/max_steps": "3153/6362"} +{"lm loss": 4.90990353, "grad_norm": 0.53409201, "learning_rate": 5.618e-05, "elapsed_time_per_iteration": 6.72435427, "memory(GiB)": 21.51, "elapsed_time": "5h 45m 40s", "remaining_time": "5h 51m 35s", "loss_scale": 1.0, "consumed_samples": 807424, "global_step/max_steps": "3154/6362"} +{"lm loss": 4.92852116, "grad_norm": 0.61453843, "learning_rate": 5.616e-05, "elapsed_time_per_iteration": 6.62952733, "memory(GiB)": 21.51, "elapsed_time": "5h 45m 47s", "remaining_time": "5h 51m 29s", "loss_scale": 1.0, "consumed_samples": 807680, "global_step/max_steps": "3155/6362"} +{"lm loss": 4.92812967, "grad_norm": 0.57568622, "learning_rate": 5.613e-05, "elapsed_time_per_iteration": 6.54826856, "memory(GiB)": 21.51, "elapsed_time": "5h 45m 53s", "remaining_time": "5h 51m 22s", "loss_scale": 1.0, "consumed_samples": 807936, "global_step/max_steps": "3156/6362"} +{"lm loss": 4.93331575, "grad_norm": 0.51362151, "learning_rate": 5.611e-05, "elapsed_time_per_iteration": 6.77794576, "memory(GiB)": 21.51, "elapsed_time": "5h 46m 0s", "remaining_time": "5h 51m 16s", "loss_scale": 1.0, "consumed_samples": 808192, "global_step/max_steps": "3157/6362"} +{"lm loss": 4.9251194, "grad_norm": 0.51657414, "learning_rate": 5.608e-05, "elapsed_time_per_iteration": 6.68162179, "memory(GiB)": 21.51, "elapsed_time": "5h 46m 7s", "remaining_time": "5h 51m 9s", "loss_scale": 1.0, "consumed_samples": 808448, "global_step/max_steps": "3158/6362"} +{"lm loss": 4.94933128, "grad_norm": 0.5197525, "learning_rate": 5.606e-05, "elapsed_time_per_iteration": 6.53688335, "memory(GiB)": 21.51, "elapsed_time": "5h 46m 13s", "remaining_time": "5h 51m 3s", "loss_scale": 1.0, "consumed_samples": 808704, "global_step/max_steps": "3159/6362"} +{"lm loss": 4.94920397, "grad_norm": 0.51766372, "learning_rate": 5.603e-05, "elapsed_time_per_iteration": 6.62165785, "memory(GiB)": 21.51, "elapsed_time": "5h 46m 20s", "remaining_time": "5h 50m 56s", "loss_scale": 1.0, "consumed_samples": 808960, "global_step/max_steps": "3160/6362"} +{"lm loss": 4.92524385, "grad_norm": 0.48099348, "learning_rate": 5.601e-05, "elapsed_time_per_iteration": 6.41735077, "memory(GiB)": 21.51, "elapsed_time": "5h 46m 26s", "remaining_time": "5h 50m 49s", "loss_scale": 1.0, "consumed_samples": 809216, "global_step/max_steps": "3161/6362"} +{"lm loss": 4.93503284, "grad_norm": 0.52582163, "learning_rate": 5.598e-05, "elapsed_time_per_iteration": 6.39017177, "memory(GiB)": 21.51, "elapsed_time": "5h 46m 33s", "remaining_time": "5h 50m 43s", "loss_scale": 1.0, "consumed_samples": 809472, "global_step/max_steps": "3162/6362"} +{"lm loss": 4.90921116, "grad_norm": 0.59614915, "learning_rate": 5.596e-05, "elapsed_time_per_iteration": 6.44967318, "memory(GiB)": 21.51, "elapsed_time": "5h 46m 39s", "remaining_time": "5h 50m 36s", "loss_scale": 1.0, "consumed_samples": 809728, "global_step/max_steps": "3163/6362"} +{"lm loss": 4.92128563, "grad_norm": 0.6048758, "learning_rate": 5.593e-05, "elapsed_time_per_iteration": 6.74980593, "memory(GiB)": 21.51, "elapsed_time": "5h 46m 46s", "remaining_time": "5h 50m 30s", "loss_scale": 1.0, "consumed_samples": 809984, "global_step/max_steps": "3164/6362"} +{"lm loss": 4.93671417, "grad_norm": 0.5844788, "learning_rate": 5.591e-05, "elapsed_time_per_iteration": 6.49133372, "memory(GiB)": 21.51, "elapsed_time": "5h 46m 52s", "remaining_time": "5h 50m 23s", "loss_scale": 1.0, "consumed_samples": 810240, "global_step/max_steps": "3165/6362"} +{"lm loss": 4.92990303, "grad_norm": 0.5077154, "learning_rate": 5.588e-05, "elapsed_time_per_iteration": 6.47350478, "memory(GiB)": 21.51, "elapsed_time": "5h 46m 59s", "remaining_time": "5h 50m 16s", "loss_scale": 1.0, "consumed_samples": 810496, "global_step/max_steps": "3166/6362"} +{"lm loss": 4.95374727, "grad_norm": 0.60182005, "learning_rate": 5.586e-05, "elapsed_time_per_iteration": 6.42046022, "memory(GiB)": 21.51, "elapsed_time": "5h 47m 5s", "remaining_time": "5h 50m 9s", "loss_scale": 1.0, "consumed_samples": 810752, "global_step/max_steps": "3167/6362"} +{"lm loss": 4.95277548, "grad_norm": 0.59694082, "learning_rate": 5.583e-05, "elapsed_time_per_iteration": 6.54657197, "memory(GiB)": 21.51, "elapsed_time": "5h 47m 12s", "remaining_time": "5h 50m 3s", "loss_scale": 1.0, "consumed_samples": 811008, "global_step/max_steps": "3168/6362"} +{"lm loss": 4.91440916, "grad_norm": 0.54346949, "learning_rate": 5.581e-05, "elapsed_time_per_iteration": 6.46119261, "memory(GiB)": 21.51, "elapsed_time": "5h 47m 18s", "remaining_time": "5h 49m 56s", "loss_scale": 1.0, "consumed_samples": 811264, "global_step/max_steps": "3169/6362"} +{"lm loss": 4.9291358, "grad_norm": 0.46771002, "learning_rate": 5.578e-05, "elapsed_time_per_iteration": 6.51149273, "memory(GiB)": 21.51, "elapsed_time": "5h 47m 25s", "remaining_time": "5h 49m 50s", "loss_scale": 1.0, "consumed_samples": 811520, "global_step/max_steps": "3170/6362"} +{"lm loss": 4.93647242, "grad_norm": 0.54772383, "learning_rate": 5.576e-05, "elapsed_time_per_iteration": 6.50347114, "memory(GiB)": 21.51, "elapsed_time": "5h 47m 31s", "remaining_time": "5h 49m 43s", "loss_scale": 1.0, "consumed_samples": 811776, "global_step/max_steps": "3171/6362"} +{"lm loss": 4.93592691, "grad_norm": 0.65466517, "learning_rate": 5.573e-05, "elapsed_time_per_iteration": 6.76439857, "memory(GiB)": 21.51, "elapsed_time": "5h 47m 38s", "remaining_time": "5h 49m 36s", "loss_scale": 1.0, "consumed_samples": 812032, "global_step/max_steps": "3172/6362"} +{"lm loss": 4.93520212, "grad_norm": 0.65061873, "learning_rate": 5.571e-05, "elapsed_time_per_iteration": 6.45942163, "memory(GiB)": 21.51, "elapsed_time": "5h 47m 45s", "remaining_time": "5h 49m 30s", "loss_scale": 1.0, "consumed_samples": 812288, "global_step/max_steps": "3173/6362"} +{"lm loss": 4.92555857, "grad_norm": 0.56864035, "learning_rate": 5.568e-05, "elapsed_time_per_iteration": 6.36671042, "memory(GiB)": 21.51, "elapsed_time": "5h 47m 51s", "remaining_time": "5h 49m 23s", "loss_scale": 1.0, "consumed_samples": 812544, "global_step/max_steps": "3174/6362"} +{"lm loss": 4.92195415, "grad_norm": 0.48592809, "learning_rate": 5.566e-05, "elapsed_time_per_iteration": 6.555619, "memory(GiB)": 21.51, "elapsed_time": "5h 47m 57s", "remaining_time": "5h 49m 16s", "loss_scale": 1.0, "consumed_samples": 812800, "global_step/max_steps": "3175/6362"} +{"lm loss": 4.92974615, "grad_norm": 0.57105666, "learning_rate": 5.563e-05, "elapsed_time_per_iteration": 6.45240951, "memory(GiB)": 21.51, "elapsed_time": "5h 48m 4s", "remaining_time": "5h 49m 10s", "loss_scale": 1.0, "consumed_samples": 813056, "global_step/max_steps": "3176/6362"} +{"lm loss": 4.93774462, "grad_norm": 0.51073748, "learning_rate": 5.561e-05, "elapsed_time_per_iteration": 6.53198314, "memory(GiB)": 21.51, "elapsed_time": "5h 48m 10s", "remaining_time": "5h 49m 3s", "loss_scale": 1.0, "consumed_samples": 813312, "global_step/max_steps": "3177/6362"} +{"lm loss": 4.94263363, "grad_norm": 0.57606894, "learning_rate": 5.558e-05, "elapsed_time_per_iteration": 6.63163304, "memory(GiB)": 21.51, "elapsed_time": "5h 48m 17s", "remaining_time": "5h 48m 57s", "loss_scale": 1.0, "consumed_samples": 813568, "global_step/max_steps": "3178/6362"} +{"lm loss": 4.91916895, "grad_norm": 0.52846891, "learning_rate": 5.556e-05, "elapsed_time_per_iteration": 6.43705177, "memory(GiB)": 21.51, "elapsed_time": "5h 48m 24s", "remaining_time": "5h 48m 50s", "loss_scale": 1.0, "consumed_samples": 813824, "global_step/max_steps": "3179/6362"} +{"lm loss": 4.91430855, "grad_norm": 0.51615274, "learning_rate": 5.553e-05, "elapsed_time_per_iteration": 6.38322949, "memory(GiB)": 21.51, "elapsed_time": "5h 48m 30s", "remaining_time": "5h 48m 43s", "loss_scale": 1.0, "consumed_samples": 814080, "global_step/max_steps": "3180/6362"} +{"lm loss": 4.90575838, "grad_norm": 0.53017175, "learning_rate": 5.551e-05, "elapsed_time_per_iteration": 6.7550447, "memory(GiB)": 21.51, "elapsed_time": "5h 48m 37s", "remaining_time": "5h 48m 37s", "loss_scale": 1.0, "consumed_samples": 814336, "global_step/max_steps": "3181/6362"} +{"lm loss": 4.93479109, "grad_norm": 0.50742304, "learning_rate": 5.548e-05, "elapsed_time_per_iteration": 6.55476165, "memory(GiB)": 21.51, "elapsed_time": "5h 48m 43s", "remaining_time": "5h 48m 30s", "loss_scale": 1.0, "consumed_samples": 814592, "global_step/max_steps": "3182/6362"} +{"lm loss": 4.92707777, "grad_norm": 0.51295203, "learning_rate": 5.545e-05, "elapsed_time_per_iteration": 6.44472599, "memory(GiB)": 21.51, "elapsed_time": "5h 48m 50s", "remaining_time": "5h 48m 23s", "loss_scale": 1.0, "consumed_samples": 814848, "global_step/max_steps": "3183/6362"} +{"lm loss": 4.9333744, "grad_norm": 0.50156218, "learning_rate": 5.543e-05, "elapsed_time_per_iteration": 6.43904972, "memory(GiB)": 21.51, "elapsed_time": "5h 48m 56s", "remaining_time": "5h 48m 17s", "loss_scale": 1.0, "consumed_samples": 815104, "global_step/max_steps": "3184/6362"} +{"lm loss": 4.93649673, "grad_norm": 0.48729613, "learning_rate": 5.54e-05, "elapsed_time_per_iteration": 6.54338074, "memory(GiB)": 21.51, "elapsed_time": "5h 49m 3s", "remaining_time": "5h 48m 10s", "loss_scale": 1.0, "consumed_samples": 815360, "global_step/max_steps": "3185/6362"} +{"lm loss": 4.92581367, "grad_norm": 0.45392272, "learning_rate": 5.538e-05, "elapsed_time_per_iteration": 6.37039161, "memory(GiB)": 21.51, "elapsed_time": "5h 49m 9s", "remaining_time": "5h 48m 3s", "loss_scale": 1.0, "consumed_samples": 815616, "global_step/max_steps": "3186/6362"} +{"lm loss": 4.91520262, "grad_norm": 0.54935545, "learning_rate": 5.535e-05, "elapsed_time_per_iteration": 6.48253441, "memory(GiB)": 21.51, "elapsed_time": "5h 49m 16s", "remaining_time": "5h 47m 57s", "loss_scale": 1.0, "consumed_samples": 815872, "global_step/max_steps": "3187/6362"} +{"lm loss": 4.94758177, "grad_norm": 0.42877087, "learning_rate": 5.533e-05, "elapsed_time_per_iteration": 6.43305063, "memory(GiB)": 21.51, "elapsed_time": "5h 49m 22s", "remaining_time": "5h 47m 50s", "loss_scale": 1.0, "consumed_samples": 816128, "global_step/max_steps": "3188/6362"} +{"lm loss": 4.92944574, "grad_norm": 0.4545885, "learning_rate": 5.53e-05, "elapsed_time_per_iteration": 6.51435208, "memory(GiB)": 21.51, "elapsed_time": "5h 49m 28s", "remaining_time": "5h 47m 43s", "loss_scale": 1.0, "consumed_samples": 816384, "global_step/max_steps": "3189/6362"} +{"lm loss": 4.94508076, "grad_norm": 0.46167669, "learning_rate": 5.528e-05, "elapsed_time_per_iteration": 6.47565842, "memory(GiB)": 21.51, "elapsed_time": "5h 49m 35s", "remaining_time": "5h 47m 37s", "loss_scale": 1.0, "consumed_samples": 816640, "global_step/max_steps": "3190/6362"} +{"lm loss": 4.95263958, "grad_norm": 0.46429321, "learning_rate": 5.525e-05, "elapsed_time_per_iteration": 6.45476675, "memory(GiB)": 21.51, "elapsed_time": "5h 49m 41s", "remaining_time": "5h 47m 30s", "loss_scale": 1.0, "consumed_samples": 816896, "global_step/max_steps": "3191/6362"} +{"lm loss": 4.91235733, "grad_norm": 0.466299, "learning_rate": 5.523e-05, "elapsed_time_per_iteration": 6.64244652, "memory(GiB)": 21.51, "elapsed_time": "5h 49m 48s", "remaining_time": "5h 47m 23s", "loss_scale": 1.0, "consumed_samples": 817152, "global_step/max_steps": "3192/6362"} +{"lm loss": 4.90165615, "grad_norm": 0.46885145, "learning_rate": 5.52e-05, "elapsed_time_per_iteration": 6.61151838, "memory(GiB)": 21.51, "elapsed_time": "5h 49m 55s", "remaining_time": "5h 47m 17s", "loss_scale": 1.0, "consumed_samples": 817408, "global_step/max_steps": "3193/6362"} +{"lm loss": 4.90935564, "grad_norm": 0.47842705, "learning_rate": 5.518e-05, "elapsed_time_per_iteration": 6.59589982, "memory(GiB)": 21.51, "elapsed_time": "5h 50m 1s", "remaining_time": "5h 47m 10s", "loss_scale": 1.0, "consumed_samples": 817664, "global_step/max_steps": "3194/6362"} +{"lm loss": 4.95367479, "grad_norm": 0.47063795, "learning_rate": 5.515e-05, "elapsed_time_per_iteration": 6.63537908, "memory(GiB)": 21.51, "elapsed_time": "5h 50m 8s", "remaining_time": "5h 47m 4s", "loss_scale": 1.0, "consumed_samples": 817920, "global_step/max_steps": "3195/6362"} +{"lm loss": 4.95658255, "grad_norm": 0.55256158, "learning_rate": 5.513e-05, "elapsed_time_per_iteration": 6.47173858, "memory(GiB)": 21.51, "elapsed_time": "5h 50m 14s", "remaining_time": "5h 46m 57s", "loss_scale": 1.0, "consumed_samples": 818176, "global_step/max_steps": "3196/6362"} +{"lm loss": 4.94595718, "grad_norm": 0.62901437, "learning_rate": 5.51e-05, "elapsed_time_per_iteration": 6.4112494, "memory(GiB)": 21.51, "elapsed_time": "5h 50m 21s", "remaining_time": "5h 46m 50s", "loss_scale": 1.0, "consumed_samples": 818432, "global_step/max_steps": "3197/6362"} +{"lm loss": 4.9354744, "grad_norm": 0.67844266, "learning_rate": 5.508e-05, "elapsed_time_per_iteration": 6.5036149, "memory(GiB)": 21.51, "elapsed_time": "5h 50m 27s", "remaining_time": "5h 46m 44s", "loss_scale": 1.0, "consumed_samples": 818688, "global_step/max_steps": "3198/6362"} +{"lm loss": 4.93544388, "grad_norm": 0.58607006, "learning_rate": 5.505e-05, "elapsed_time_per_iteration": 6.54412174, "memory(GiB)": 21.51, "elapsed_time": "5h 50m 34s", "remaining_time": "5h 46m 37s", "loss_scale": 1.0, "consumed_samples": 818944, "global_step/max_steps": "3199/6362"} +{"lm loss": 4.92400885, "grad_norm": 0.52312231, "learning_rate": 5.503e-05, "elapsed_time_per_iteration": 6.35341454, "memory(GiB)": 21.51, "elapsed_time": "5h 50m 40s", "remaining_time": "5h 46m 30s", "loss_scale": 1.0, "consumed_samples": 819200, "global_step/max_steps": "3200/6362"} +{"lm loss": 4.91286325, "grad_norm": 0.52817756, "learning_rate": 5.5e-05, "elapsed_time_per_iteration": 6.39073539, "memory(GiB)": 21.51, "elapsed_time": "5h 50m 47s", "remaining_time": "5h 46m 24s", "loss_scale": 1.0, "consumed_samples": 819456, "global_step/max_steps": "3201/6362"} +{"lm loss": 4.93767929, "grad_norm": 0.58208841, "learning_rate": 5.498e-05, "elapsed_time_per_iteration": 6.58917356, "memory(GiB)": 21.51, "elapsed_time": "5h 50m 53s", "remaining_time": "5h 46m 17s", "loss_scale": 1.0, "consumed_samples": 819712, "global_step/max_steps": "3202/6362"} +{"lm loss": 4.93804979, "grad_norm": 0.54040074, "learning_rate": 5.495e-05, "elapsed_time_per_iteration": 6.84493804, "memory(GiB)": 21.51, "elapsed_time": "5h 51m 0s", "remaining_time": "5h 46m 11s", "loss_scale": 1.0, "consumed_samples": 819968, "global_step/max_steps": "3203/6362"} +{"lm loss": 4.91514063, "grad_norm": 0.48969603, "learning_rate": 5.493e-05, "elapsed_time_per_iteration": 6.35172153, "memory(GiB)": 21.51, "elapsed_time": "5h 51m 6s", "remaining_time": "5h 46m 4s", "loss_scale": 1.0, "consumed_samples": 820224, "global_step/max_steps": "3204/6362"} +{"lm loss": 4.92073822, "grad_norm": 0.47545755, "learning_rate": 5.49e-05, "elapsed_time_per_iteration": 6.47051144, "memory(GiB)": 21.51, "elapsed_time": "5h 51m 13s", "remaining_time": "5h 45m 57s", "loss_scale": 1.0, "consumed_samples": 820480, "global_step/max_steps": "3205/6362"} +{"lm loss": 4.92480898, "grad_norm": 0.51763958, "learning_rate": 5.488e-05, "elapsed_time_per_iteration": 6.62738657, "memory(GiB)": 21.51, "elapsed_time": "5h 51m 19s", "remaining_time": "5h 45m 51s", "loss_scale": 1.0, "consumed_samples": 820736, "global_step/max_steps": "3206/6362"} +{"lm loss": 4.90883875, "grad_norm": 0.54988974, "learning_rate": 5.485e-05, "elapsed_time_per_iteration": 6.62527418, "memory(GiB)": 21.51, "elapsed_time": "5h 51m 26s", "remaining_time": "5h 45m 44s", "loss_scale": 1.0, "consumed_samples": 820992, "global_step/max_steps": "3207/6362"} +{"lm loss": 4.93903017, "grad_norm": 0.5255807, "learning_rate": 5.483e-05, "elapsed_time_per_iteration": 6.60365582, "memory(GiB)": 21.51, "elapsed_time": "5h 51m 33s", "remaining_time": "5h 45m 38s", "loss_scale": 1.0, "consumed_samples": 821248, "global_step/max_steps": "3208/6362"} +{"lm loss": 4.93619633, "grad_norm": 0.50645667, "learning_rate": 5.48e-05, "elapsed_time_per_iteration": 6.35646629, "memory(GiB)": 21.51, "elapsed_time": "5h 51m 39s", "remaining_time": "5h 45m 31s", "loss_scale": 1.0, "consumed_samples": 821504, "global_step/max_steps": "3209/6362"} +{"lm loss": 4.94266558, "grad_norm": 0.52056479, "learning_rate": 5.478e-05, "elapsed_time_per_iteration": 6.79728532, "memory(GiB)": 21.51, "elapsed_time": "5h 51m 46s", "remaining_time": "5h 45m 24s", "loss_scale": 1.0, "consumed_samples": 821760, "global_step/max_steps": "3210/6362"} +{"lm loss": 4.93538713, "grad_norm": 0.51676875, "learning_rate": 5.475e-05, "elapsed_time_per_iteration": 6.56262994, "memory(GiB)": 21.51, "elapsed_time": "5h 51m 52s", "remaining_time": "5h 45m 18s", "loss_scale": 1.0, "consumed_samples": 822016, "global_step/max_steps": "3211/6362"} +{"lm loss": 4.93593693, "grad_norm": 0.47053972, "learning_rate": 5.473e-05, "elapsed_time_per_iteration": 6.6459558, "memory(GiB)": 21.51, "elapsed_time": "5h 51m 59s", "remaining_time": "5h 45m 11s", "loss_scale": 1.0, "consumed_samples": 822272, "global_step/max_steps": "3212/6362"} +{"lm loss": 4.93500137, "grad_norm": 0.48004517, "learning_rate": 5.47e-05, "elapsed_time_per_iteration": 6.53046942, "memory(GiB)": 21.51, "elapsed_time": "5h 52m 6s", "remaining_time": "5h 45m 5s", "loss_scale": 1.0, "consumed_samples": 822528, "global_step/max_steps": "3213/6362"} +{"lm loss": 4.94551277, "grad_norm": 0.50027585, "learning_rate": 5.468e-05, "elapsed_time_per_iteration": 6.63745928, "memory(GiB)": 21.51, "elapsed_time": "5h 52m 12s", "remaining_time": "5h 44m 58s", "loss_scale": 1.0, "consumed_samples": 822784, "global_step/max_steps": "3214/6362"} +{"lm loss": 4.94894886, "grad_norm": 0.45648634, "learning_rate": 5.465e-05, "elapsed_time_per_iteration": 6.48947859, "memory(GiB)": 21.51, "elapsed_time": "5h 52m 19s", "remaining_time": "5h 44m 52s", "loss_scale": 1.0, "consumed_samples": 823040, "global_step/max_steps": "3215/6362"} +{"lm loss": 4.94769573, "grad_norm": 0.41953266, "learning_rate": 5.463e-05, "elapsed_time_per_iteration": 6.73532772, "memory(GiB)": 21.51, "elapsed_time": "5h 52m 25s", "remaining_time": "5h 44m 45s", "loss_scale": 1.0, "consumed_samples": 823296, "global_step/max_steps": "3216/6362"} +{"lm loss": 4.94399548, "grad_norm": 0.50111723, "learning_rate": 5.46e-05, "elapsed_time_per_iteration": 6.57833242, "memory(GiB)": 21.51, "elapsed_time": "5h 52m 32s", "remaining_time": "5h 44m 39s", "loss_scale": 1.0, "consumed_samples": 823552, "global_step/max_steps": "3217/6362"} +{"lm loss": 4.94882393, "grad_norm": 0.5816381, "learning_rate": 5.457e-05, "elapsed_time_per_iteration": 6.57486367, "memory(GiB)": 21.51, "elapsed_time": "5h 52m 39s", "remaining_time": "5h 44m 32s", "loss_scale": 1.0, "consumed_samples": 823808, "global_step/max_steps": "3218/6362"} +{"lm loss": 4.93247843, "grad_norm": 0.54949677, "learning_rate": 5.455e-05, "elapsed_time_per_iteration": 6.68122435, "memory(GiB)": 21.51, "elapsed_time": "5h 52m 45s", "remaining_time": "5h 44m 26s", "loss_scale": 1.0, "consumed_samples": 824064, "global_step/max_steps": "3219/6362"} +{"lm loss": 4.94268036, "grad_norm": 0.45042554, "learning_rate": 5.452e-05, "elapsed_time_per_iteration": 6.38575339, "memory(GiB)": 21.51, "elapsed_time": "5h 52m 52s", "remaining_time": "5h 44m 19s", "loss_scale": 1.0, "consumed_samples": 824320, "global_step/max_steps": "3220/6362"} +{"lm loss": 4.92236757, "grad_norm": 0.49935439, "learning_rate": 5.45e-05, "elapsed_time_per_iteration": 6.4527688, "memory(GiB)": 21.51, "elapsed_time": "5h 52m 58s", "remaining_time": "5h 44m 12s", "loss_scale": 1.0, "consumed_samples": 824576, "global_step/max_steps": "3221/6362"} +{"lm loss": 4.94643259, "grad_norm": 0.53297603, "learning_rate": 5.447e-05, "elapsed_time_per_iteration": 6.57758856, "memory(GiB)": 21.51, "elapsed_time": "5h 53m 5s", "remaining_time": "5h 44m 6s", "loss_scale": 1.0, "consumed_samples": 824832, "global_step/max_steps": "3222/6362"} +{"lm loss": 4.93044806, "grad_norm": 0.49115461, "learning_rate": 5.445e-05, "elapsed_time_per_iteration": 6.55470705, "memory(GiB)": 21.51, "elapsed_time": "5h 53m 11s", "remaining_time": "5h 43m 59s", "loss_scale": 1.0, "consumed_samples": 825088, "global_step/max_steps": "3223/6362"} +{"lm loss": 4.94367552, "grad_norm": 0.56529987, "learning_rate": 5.442e-05, "elapsed_time_per_iteration": 6.49713182, "memory(GiB)": 21.51, "elapsed_time": "5h 53m 18s", "remaining_time": "5h 43m 52s", "loss_scale": 1.0, "consumed_samples": 825344, "global_step/max_steps": "3224/6362"} +{"lm loss": 4.9273634, "grad_norm": 0.63767844, "learning_rate": 5.44e-05, "elapsed_time_per_iteration": 6.62358379, "memory(GiB)": 21.51, "elapsed_time": "5h 53m 24s", "remaining_time": "5h 43m 46s", "loss_scale": 1.0, "consumed_samples": 825600, "global_step/max_steps": "3225/6362"} +{"lm loss": 4.92746782, "grad_norm": 0.60180998, "learning_rate": 5.437e-05, "elapsed_time_per_iteration": 6.44928026, "memory(GiB)": 21.51, "elapsed_time": "5h 53m 31s", "remaining_time": "5h 43m 39s", "loss_scale": 1.0, "consumed_samples": 825856, "global_step/max_steps": "3226/6362"} +{"lm loss": 4.91269922, "grad_norm": 0.53637093, "learning_rate": 5.435e-05, "elapsed_time_per_iteration": 6.46646881, "memory(GiB)": 21.51, "elapsed_time": "5h 53m 37s", "remaining_time": "5h 43m 32s", "loss_scale": 1.0, "consumed_samples": 826112, "global_step/max_steps": "3227/6362"} +{"lm loss": 4.94397545, "grad_norm": 0.51039904, "learning_rate": 5.432e-05, "elapsed_time_per_iteration": 6.63767838, "memory(GiB)": 21.51, "elapsed_time": "5h 53m 44s", "remaining_time": "5h 43m 26s", "loss_scale": 1.0, "consumed_samples": 826368, "global_step/max_steps": "3228/6362"} +{"lm loss": 4.90694427, "grad_norm": 0.50456774, "learning_rate": 5.43e-05, "elapsed_time_per_iteration": 6.33472133, "memory(GiB)": 21.51, "elapsed_time": "5h 53m 50s", "remaining_time": "5h 43m 19s", "loss_scale": 1.0, "consumed_samples": 826624, "global_step/max_steps": "3229/6362"} +{"lm loss": 4.94355202, "grad_norm": 0.53338265, "learning_rate": 5.427e-05, "elapsed_time_per_iteration": 6.64600348, "memory(GiB)": 21.51, "elapsed_time": "5h 53m 57s", "remaining_time": "5h 43m 13s", "loss_scale": 1.0, "consumed_samples": 826880, "global_step/max_steps": "3230/6362"} +{"lm loss": 4.93920803, "grad_norm": 0.50476474, "learning_rate": 5.425e-05, "elapsed_time_per_iteration": 6.56064773, "memory(GiB)": 21.51, "elapsed_time": "5h 54m 3s", "remaining_time": "5h 43m 6s", "loss_scale": 1.0, "consumed_samples": 827136, "global_step/max_steps": "3231/6362"} +{"lm loss": 4.93187809, "grad_norm": 0.48617941, "learning_rate": 5.422e-05, "elapsed_time_per_iteration": 6.64201617, "memory(GiB)": 21.51, "elapsed_time": "5h 54m 10s", "remaining_time": "5h 42m 59s", "loss_scale": 1.0, "consumed_samples": 827392, "global_step/max_steps": "3232/6362"} +{"lm loss": 4.94420338, "grad_norm": 0.53634602, "learning_rate": 5.42e-05, "elapsed_time_per_iteration": 6.673733, "memory(GiB)": 21.51, "elapsed_time": "5h 54m 17s", "remaining_time": "5h 42m 53s", "loss_scale": 1.0, "consumed_samples": 827648, "global_step/max_steps": "3233/6362"} +{"lm loss": 4.94387293, "grad_norm": 0.52989656, "learning_rate": 5.417e-05, "elapsed_time_per_iteration": 6.68092489, "memory(GiB)": 21.51, "elapsed_time": "5h 54m 23s", "remaining_time": "5h 42m 46s", "loss_scale": 1.0, "consumed_samples": 827904, "global_step/max_steps": "3234/6362"} +{"lm loss": 4.945364, "grad_norm": 0.51728296, "learning_rate": 5.415e-05, "elapsed_time_per_iteration": 6.67179918, "memory(GiB)": 21.51, "elapsed_time": "5h 54m 30s", "remaining_time": "5h 42m 40s", "loss_scale": 1.0, "consumed_samples": 828160, "global_step/max_steps": "3235/6362"} +{"lm loss": 4.94091988, "grad_norm": 0.52019775, "learning_rate": 5.412e-05, "elapsed_time_per_iteration": 6.59665728, "memory(GiB)": 21.51, "elapsed_time": "5h 54m 37s", "remaining_time": "5h 42m 33s", "loss_scale": 1.0, "consumed_samples": 828416, "global_step/max_steps": "3236/6362"} +{"lm loss": 4.93257189, "grad_norm": 0.51711178, "learning_rate": 5.41e-05, "elapsed_time_per_iteration": 6.63985491, "memory(GiB)": 21.51, "elapsed_time": "5h 54m 43s", "remaining_time": "5h 42m 27s", "loss_scale": 1.0, "consumed_samples": 828672, "global_step/max_steps": "3237/6362"} +{"lm loss": 4.9170413, "grad_norm": 0.50027955, "learning_rate": 5.407e-05, "elapsed_time_per_iteration": 6.51436186, "memory(GiB)": 21.51, "elapsed_time": "5h 54m 50s", "remaining_time": "5h 42m 20s", "loss_scale": 1.0, "consumed_samples": 828928, "global_step/max_steps": "3238/6362"} +{"lm loss": 4.941185, "grad_norm": 0.56173974, "learning_rate": 5.405e-05, "elapsed_time_per_iteration": 6.86731887, "memory(GiB)": 21.51, "elapsed_time": "5h 54m 57s", "remaining_time": "5h 42m 14s", "loss_scale": 1.0, "consumed_samples": 829184, "global_step/max_steps": "3239/6362"} +{"lm loss": 4.91966629, "grad_norm": 0.59397417, "learning_rate": 5.402e-05, "elapsed_time_per_iteration": 6.62000918, "memory(GiB)": 21.51, "elapsed_time": "5h 55m 3s", "remaining_time": "5h 42m 7s", "loss_scale": 1.0, "consumed_samples": 829440, "global_step/max_steps": "3240/6362"} +{"lm loss": 4.94989014, "grad_norm": 0.52119678, "learning_rate": 5.4e-05, "elapsed_time_per_iteration": 6.6431067, "memory(GiB)": 21.51, "elapsed_time": "5h 55m 10s", "remaining_time": "5h 42m 1s", "loss_scale": 1.0, "consumed_samples": 829696, "global_step/max_steps": "3241/6362"} +{"lm loss": 4.92042303, "grad_norm": 0.49134818, "learning_rate": 5.397e-05, "elapsed_time_per_iteration": 6.70377731, "memory(GiB)": 21.51, "elapsed_time": "5h 55m 17s", "remaining_time": "5h 41m 55s", "loss_scale": 1.0, "consumed_samples": 829952, "global_step/max_steps": "3242/6362"} +{"lm loss": 4.93968725, "grad_norm": 0.59626448, "learning_rate": 5.395e-05, "elapsed_time_per_iteration": 6.51898646, "memory(GiB)": 21.51, "elapsed_time": "5h 55m 23s", "remaining_time": "5h 41m 48s", "loss_scale": 1.0, "consumed_samples": 830208, "global_step/max_steps": "3243/6362"} +{"lm loss": 4.95638895, "grad_norm": 0.51759291, "learning_rate": 5.392e-05, "elapsed_time_per_iteration": 6.73742437, "memory(GiB)": 21.51, "elapsed_time": "5h 55m 30s", "remaining_time": "5h 41m 41s", "loss_scale": 1.0, "consumed_samples": 830464, "global_step/max_steps": "3244/6362"} +{"lm loss": 4.92832184, "grad_norm": 0.51449507, "learning_rate": 5.39e-05, "elapsed_time_per_iteration": 6.70507622, "memory(GiB)": 21.51, "elapsed_time": "5h 55m 37s", "remaining_time": "5h 41m 35s", "loss_scale": 1.0, "consumed_samples": 830720, "global_step/max_steps": "3245/6362"} +{"lm loss": 4.92920208, "grad_norm": 0.61229575, "learning_rate": 5.387e-05, "elapsed_time_per_iteration": 6.54276681, "memory(GiB)": 21.51, "elapsed_time": "5h 55m 43s", "remaining_time": "5h 41m 28s", "loss_scale": 1.0, "consumed_samples": 830976, "global_step/max_steps": "3246/6362"} +{"lm loss": 4.93224621, "grad_norm": 0.57427555, "learning_rate": 5.384e-05, "elapsed_time_per_iteration": 6.60800529, "memory(GiB)": 21.51, "elapsed_time": "5h 55m 50s", "remaining_time": "5h 41m 22s", "loss_scale": 1.0, "consumed_samples": 831232, "global_step/max_steps": "3247/6362"} +{"lm loss": 4.94654131, "grad_norm": 0.53950077, "learning_rate": 5.382e-05, "elapsed_time_per_iteration": 6.67288089, "memory(GiB)": 21.51, "elapsed_time": "5h 55m 56s", "remaining_time": "5h 41m 15s", "loss_scale": 1.0, "consumed_samples": 831488, "global_step/max_steps": "3248/6362"} +{"lm loss": 4.90968227, "grad_norm": 0.57975328, "learning_rate": 5.379e-05, "elapsed_time_per_iteration": 6.90096855, "memory(GiB)": 21.51, "elapsed_time": "5h 56m 3s", "remaining_time": "5h 41m 9s", "loss_scale": 1.0, "consumed_samples": 831744, "global_step/max_steps": "3249/6362"} +{"lm loss": 4.9308629, "grad_norm": 0.58576035, "learning_rate": 5.377e-05, "elapsed_time_per_iteration": 6.7175498, "memory(GiB)": 21.51, "elapsed_time": "5h 56m 10s", "remaining_time": "5h 41m 3s", "loss_scale": 1.0, "consumed_samples": 832000, "global_step/max_steps": "3250/6362"} +{"lm loss": 4.93368101, "grad_norm": 0.59187567, "learning_rate": 5.374e-05, "elapsed_time_per_iteration": 6.80685949, "memory(GiB)": 21.51, "elapsed_time": "5h 56m 17s", "remaining_time": "5h 40m 56s", "loss_scale": 1.0, "consumed_samples": 832256, "global_step/max_steps": "3251/6362"} +{"lm loss": 4.92728233, "grad_norm": 0.51845944, "learning_rate": 5.372e-05, "elapsed_time_per_iteration": 6.66191149, "memory(GiB)": 21.51, "elapsed_time": "5h 56m 24s", "remaining_time": "5h 40m 50s", "loss_scale": 1.0, "consumed_samples": 832512, "global_step/max_steps": "3252/6362"} +{"lm loss": 4.93957043, "grad_norm": 0.4573079, "learning_rate": 5.369e-05, "elapsed_time_per_iteration": 6.6738832, "memory(GiB)": 21.51, "elapsed_time": "5h 56m 30s", "remaining_time": "5h 40m 43s", "loss_scale": 1.0, "consumed_samples": 832768, "global_step/max_steps": "3253/6362"} +{"lm loss": 4.93962765, "grad_norm": 0.54649693, "learning_rate": 5.367e-05, "elapsed_time_per_iteration": 6.7238183, "memory(GiB)": 21.51, "elapsed_time": "5h 56m 37s", "remaining_time": "5h 40m 37s", "loss_scale": 1.0, "consumed_samples": 833024, "global_step/max_steps": "3254/6362"} +{"lm loss": 4.92197323, "grad_norm": 0.50260067, "learning_rate": 5.364e-05, "elapsed_time_per_iteration": 6.58195972, "memory(GiB)": 21.51, "elapsed_time": "5h 56m 44s", "remaining_time": "5h 40m 30s", "loss_scale": 1.0, "consumed_samples": 833280, "global_step/max_steps": "3255/6362"} +{"lm loss": 4.93898582, "grad_norm": 0.46848184, "learning_rate": 5.362e-05, "elapsed_time_per_iteration": 6.58036494, "memory(GiB)": 21.51, "elapsed_time": "5h 56m 50s", "remaining_time": "5h 40m 24s", "loss_scale": 1.0, "consumed_samples": 833536, "global_step/max_steps": "3256/6362"} +{"lm loss": 4.89997673, "grad_norm": 0.55490232, "learning_rate": 5.359e-05, "elapsed_time_per_iteration": 6.49837017, "memory(GiB)": 21.51, "elapsed_time": "5h 56m 57s", "remaining_time": "5h 40m 17s", "loss_scale": 1.0, "consumed_samples": 833792, "global_step/max_steps": "3257/6362"} +{"lm loss": 4.90889692, "grad_norm": 0.55389696, "learning_rate": 5.357e-05, "elapsed_time_per_iteration": 6.77105999, "memory(GiB)": 21.51, "elapsed_time": "5h 57m 3s", "remaining_time": "5h 40m 11s", "loss_scale": 1.0, "consumed_samples": 834048, "global_step/max_steps": "3258/6362"} +{"lm loss": 4.95189095, "grad_norm": 0.46043813, "learning_rate": 5.354e-05, "elapsed_time_per_iteration": 6.61598277, "memory(GiB)": 21.51, "elapsed_time": "5h 57m 10s", "remaining_time": "5h 40m 4s", "loss_scale": 1.0, "consumed_samples": 834304, "global_step/max_steps": "3259/6362"} +{"lm loss": 4.91461134, "grad_norm": 0.57329881, "learning_rate": 5.352e-05, "elapsed_time_per_iteration": 6.57030416, "memory(GiB)": 21.51, "elapsed_time": "5h 57m 17s", "remaining_time": "5h 39m 58s", "loss_scale": 1.0, "consumed_samples": 834560, "global_step/max_steps": "3260/6362"} +{"lm loss": 4.94529963, "grad_norm": 0.56965548, "learning_rate": 5.349e-05, "elapsed_time_per_iteration": 6.58289981, "memory(GiB)": 21.51, "elapsed_time": "5h 57m 23s", "remaining_time": "5h 39m 51s", "loss_scale": 1.0, "consumed_samples": 834816, "global_step/max_steps": "3261/6362"} +{"lm loss": 4.92984915, "grad_norm": 0.49801096, "learning_rate": 5.347e-05, "elapsed_time_per_iteration": 6.59891725, "memory(GiB)": 21.51, "elapsed_time": "5h 57m 30s", "remaining_time": "5h 39m 44s", "loss_scale": 1.0, "consumed_samples": 835072, "global_step/max_steps": "3262/6362"} +{"lm loss": 4.95083094, "grad_norm": 0.51809824, "learning_rate": 5.344e-05, "elapsed_time_per_iteration": 6.39601946, "memory(GiB)": 21.51, "elapsed_time": "5h 57m 36s", "remaining_time": "5h 39m 38s", "loss_scale": 1.0, "consumed_samples": 835328, "global_step/max_steps": "3263/6362"} +{"lm loss": 4.96418381, "grad_norm": 0.46679854, "learning_rate": 5.342e-05, "elapsed_time_per_iteration": 6.62322736, "memory(GiB)": 21.51, "elapsed_time": "5h 57m 43s", "remaining_time": "5h 39m 31s", "loss_scale": 1.0, "consumed_samples": 835584, "global_step/max_steps": "3264/6362"} +{"lm loss": 4.93833017, "grad_norm": 0.54504746, "learning_rate": 5.339e-05, "elapsed_time_per_iteration": 6.36788559, "memory(GiB)": 21.51, "elapsed_time": "5h 57m 49s", "remaining_time": "5h 39m 24s", "loss_scale": 1.0, "consumed_samples": 835840, "global_step/max_steps": "3265/6362"} +{"lm loss": 4.93488312, "grad_norm": 0.5577786, "learning_rate": 5.337e-05, "elapsed_time_per_iteration": 6.56203651, "memory(GiB)": 21.51, "elapsed_time": "5h 57m 56s", "remaining_time": "5h 39m 18s", "loss_scale": 1.0, "consumed_samples": 836096, "global_step/max_steps": "3266/6362"} +{"lm loss": 4.91659212, "grad_norm": 0.51857173, "learning_rate": 5.334e-05, "elapsed_time_per_iteration": 6.49418926, "memory(GiB)": 21.51, "elapsed_time": "5h 58m 2s", "remaining_time": "5h 39m 11s", "loss_scale": 1.0, "consumed_samples": 836352, "global_step/max_steps": "3267/6362"} +{"lm loss": 4.93498182, "grad_norm": 0.47470623, "learning_rate": 5.332e-05, "elapsed_time_per_iteration": 6.49756384, "memory(GiB)": 21.51, "elapsed_time": "5h 58m 9s", "remaining_time": "5h 39m 5s", "loss_scale": 1.0, "consumed_samples": 836608, "global_step/max_steps": "3268/6362"} +{"lm loss": 4.96592474, "grad_norm": 0.46959472, "learning_rate": 5.329e-05, "elapsed_time_per_iteration": 6.54964304, "memory(GiB)": 21.51, "elapsed_time": "5h 58m 15s", "remaining_time": "5h 38m 58s", "loss_scale": 1.0, "consumed_samples": 836864, "global_step/max_steps": "3269/6362"} +{"lm loss": 4.91983747, "grad_norm": 0.50611436, "learning_rate": 5.327e-05, "elapsed_time_per_iteration": 6.65536332, "memory(GiB)": 21.51, "elapsed_time": "5h 58m 22s", "remaining_time": "5h 38m 51s", "loss_scale": 1.0, "consumed_samples": 837120, "global_step/max_steps": "3270/6362"} +{"lm loss": 4.91465759, "grad_norm": 0.50469065, "learning_rate": 5.324e-05, "elapsed_time_per_iteration": 6.71241212, "memory(GiB)": 21.51, "elapsed_time": "5h 58m 29s", "remaining_time": "5h 38m 45s", "loss_scale": 1.0, "consumed_samples": 837376, "global_step/max_steps": "3271/6362"} +{"lm loss": 4.92183065, "grad_norm": 0.4652082, "learning_rate": 5.322e-05, "elapsed_time_per_iteration": 6.46232414, "memory(GiB)": 21.51, "elapsed_time": "5h 58m 35s", "remaining_time": "5h 38m 38s", "loss_scale": 1.0, "consumed_samples": 837632, "global_step/max_steps": "3272/6362"} +{"lm loss": 4.90898895, "grad_norm": 0.54607451, "learning_rate": 5.319e-05, "elapsed_time_per_iteration": 6.68635201, "memory(GiB)": 21.51, "elapsed_time": "5h 58m 42s", "remaining_time": "5h 38m 32s", "loss_scale": 1.0, "consumed_samples": 837888, "global_step/max_steps": "3273/6362"} +{"lm loss": 4.90368366, "grad_norm": 0.47319937, "learning_rate": 5.316e-05, "elapsed_time_per_iteration": 6.66331959, "memory(GiB)": 21.51, "elapsed_time": "5h 58m 48s", "remaining_time": "5h 38m 25s", "loss_scale": 1.0, "consumed_samples": 838144, "global_step/max_steps": "3274/6362"} +{"lm loss": 4.9268589, "grad_norm": 0.50799906, "learning_rate": 5.314e-05, "elapsed_time_per_iteration": 6.39364648, "memory(GiB)": 21.51, "elapsed_time": "5h 58m 55s", "remaining_time": "5h 38m 19s", "loss_scale": 1.0, "consumed_samples": 838400, "global_step/max_steps": "3275/6362"} +{"lm loss": 4.92348909, "grad_norm": 0.45642, "learning_rate": 5.311e-05, "elapsed_time_per_iteration": 6.46221757, "memory(GiB)": 21.51, "elapsed_time": "5h 59m 1s", "remaining_time": "5h 38m 12s", "loss_scale": 1.0, "consumed_samples": 838656, "global_step/max_steps": "3276/6362"} +{"lm loss": 4.93383646, "grad_norm": 0.50409019, "learning_rate": 5.309e-05, "elapsed_time_per_iteration": 6.87827134, "memory(GiB)": 21.51, "elapsed_time": "5h 59m 8s", "remaining_time": "5h 38m 6s", "loss_scale": 1.0, "consumed_samples": 838912, "global_step/max_steps": "3277/6362"} +{"lm loss": 4.90193558, "grad_norm": 0.53272039, "learning_rate": 5.306e-05, "elapsed_time_per_iteration": 6.59450006, "memory(GiB)": 21.51, "elapsed_time": "5h 59m 15s", "remaining_time": "5h 37m 59s", "loss_scale": 1.0, "consumed_samples": 839168, "global_step/max_steps": "3278/6362"} +{"lm loss": 4.92857599, "grad_norm": 0.57500774, "learning_rate": 5.304e-05, "elapsed_time_per_iteration": 6.56255579, "memory(GiB)": 21.51, "elapsed_time": "5h 59m 21s", "remaining_time": "5h 37m 52s", "loss_scale": 1.0, "consumed_samples": 839424, "global_step/max_steps": "3279/6362"} +{"lm loss": 4.91590357, "grad_norm": 0.64547729, "learning_rate": 5.301e-05, "elapsed_time_per_iteration": 6.4532795, "memory(GiB)": 21.51, "elapsed_time": "5h 59m 28s", "remaining_time": "5h 37m 46s", "loss_scale": 1.0, "consumed_samples": 839680, "global_step/max_steps": "3280/6362"} +{"lm loss": 4.93010998, "grad_norm": 0.69094205, "learning_rate": 5.299e-05, "elapsed_time_per_iteration": 6.29150867, "memory(GiB)": 21.51, "elapsed_time": "5h 59m 34s", "remaining_time": "5h 37m 39s", "loss_scale": 1.0, "consumed_samples": 839936, "global_step/max_steps": "3281/6362"} +{"lm loss": 4.91861153, "grad_norm": 0.53376704, "learning_rate": 5.296e-05, "elapsed_time_per_iteration": 6.54893994, "memory(GiB)": 21.51, "elapsed_time": "5h 59m 41s", "remaining_time": "5h 37m 32s", "loss_scale": 1.0, "consumed_samples": 840192, "global_step/max_steps": "3282/6362"} +{"lm loss": 4.93088913, "grad_norm": 0.5086745, "learning_rate": 5.294e-05, "elapsed_time_per_iteration": 6.53608394, "memory(GiB)": 21.51, "elapsed_time": "5h 59m 47s", "remaining_time": "5h 37m 26s", "loss_scale": 1.0, "consumed_samples": 840448, "global_step/max_steps": "3283/6362"} +{"lm loss": 4.92816019, "grad_norm": 0.51849085, "learning_rate": 5.291e-05, "elapsed_time_per_iteration": 7.26837492, "memory(GiB)": 21.51, "elapsed_time": "5h 59m 54s", "remaining_time": "5h 37m 20s", "loss_scale": 1.0, "consumed_samples": 840704, "global_step/max_steps": "3284/6362"} +{"lm loss": 4.92345905, "grad_norm": 0.4656944, "learning_rate": 5.289e-05, "elapsed_time_per_iteration": 6.41254902, "memory(GiB)": 21.51, "elapsed_time": "6h 0m 1s", "remaining_time": "5h 37m 13s", "loss_scale": 1.0, "consumed_samples": 840960, "global_step/max_steps": "3285/6362"} +{"lm loss": 4.95316076, "grad_norm": 0.48835224, "learning_rate": 5.286e-05, "elapsed_time_per_iteration": 6.63883162, "memory(GiB)": 21.51, "elapsed_time": "6h 0m 7s", "remaining_time": "5h 37m 7s", "loss_scale": 1.0, "consumed_samples": 841216, "global_step/max_steps": "3286/6362"} +{"lm loss": 4.94657326, "grad_norm": 0.52297717, "learning_rate": 5.284e-05, "elapsed_time_per_iteration": 6.78075266, "memory(GiB)": 21.51, "elapsed_time": "6h 0m 14s", "remaining_time": "5h 37m 0s", "loss_scale": 1.0, "consumed_samples": 841472, "global_step/max_steps": "3287/6362"} +{"lm loss": 4.91358328, "grad_norm": 0.46591687, "learning_rate": 5.281e-05, "elapsed_time_per_iteration": 6.85228777, "memory(GiB)": 21.51, "elapsed_time": "6h 0m 21s", "remaining_time": "5h 36m 54s", "loss_scale": 1.0, "consumed_samples": 841728, "global_step/max_steps": "3288/6362"} +{"lm loss": 4.9244833, "grad_norm": 0.49709833, "learning_rate": 5.279e-05, "elapsed_time_per_iteration": 6.57018924, "memory(GiB)": 21.51, "elapsed_time": "6h 0m 28s", "remaining_time": "5h 36m 47s", "loss_scale": 1.0, "consumed_samples": 841984, "global_step/max_steps": "3289/6362"} +{"lm loss": 4.93873644, "grad_norm": 0.48527572, "learning_rate": 5.276e-05, "elapsed_time_per_iteration": 6.63771343, "memory(GiB)": 21.51, "elapsed_time": "6h 0m 34s", "remaining_time": "5h 36m 41s", "loss_scale": 1.0, "consumed_samples": 842240, "global_step/max_steps": "3290/6362"} +{"lm loss": 4.9271965, "grad_norm": 0.46732315, "learning_rate": 5.274e-05, "elapsed_time_per_iteration": 6.47541428, "memory(GiB)": 21.51, "elapsed_time": "6h 0m 41s", "remaining_time": "5h 36m 34s", "loss_scale": 1.0, "consumed_samples": 842496, "global_step/max_steps": "3291/6362"} +{"lm loss": 4.89718342, "grad_norm": 0.43879968, "learning_rate": 5.271e-05, "elapsed_time_per_iteration": 6.40344715, "memory(GiB)": 21.51, "elapsed_time": "6h 0m 47s", "remaining_time": "5h 36m 27s", "loss_scale": 1.0, "consumed_samples": 842752, "global_step/max_steps": "3292/6362"} +{"lm loss": 4.92291594, "grad_norm": 0.46627229, "learning_rate": 5.269e-05, "elapsed_time_per_iteration": 6.42692447, "memory(GiB)": 21.51, "elapsed_time": "6h 0m 54s", "remaining_time": "5h 36m 21s", "loss_scale": 1.0, "consumed_samples": 843008, "global_step/max_steps": "3293/6362"} +{"lm loss": 4.92773867, "grad_norm": 0.48439804, "learning_rate": 5.266e-05, "elapsed_time_per_iteration": 6.69313335, "memory(GiB)": 21.51, "elapsed_time": "6h 1m 0s", "remaining_time": "5h 36m 14s", "loss_scale": 1.0, "consumed_samples": 843264, "global_step/max_steps": "3294/6362"} +{"lm loss": 4.91483164, "grad_norm": 0.528584, "learning_rate": 5.264e-05, "elapsed_time_per_iteration": 6.61208272, "memory(GiB)": 21.51, "elapsed_time": "6h 1m 7s", "remaining_time": "5h 36m 8s", "loss_scale": 1.0, "consumed_samples": 843520, "global_step/max_steps": "3295/6362"} +{"lm loss": 4.92805004, "grad_norm": 0.47787791, "learning_rate": 5.261e-05, "elapsed_time_per_iteration": 6.49687338, "memory(GiB)": 21.51, "elapsed_time": "6h 1m 13s", "remaining_time": "5h 36m 1s", "loss_scale": 1.0, "consumed_samples": 843776, "global_step/max_steps": "3296/6362"} +{"lm loss": 4.93493176, "grad_norm": 0.48140359, "learning_rate": 5.259e-05, "elapsed_time_per_iteration": 6.46613121, "memory(GiB)": 21.51, "elapsed_time": "6h 1m 20s", "remaining_time": "5h 35m 54s", "loss_scale": 1.0, "consumed_samples": 844032, "global_step/max_steps": "3297/6362"} +{"lm loss": 4.93564415, "grad_norm": 0.54667956, "learning_rate": 5.256e-05, "elapsed_time_per_iteration": 6.43872261, "memory(GiB)": 21.51, "elapsed_time": "6h 1m 26s", "remaining_time": "5h 35m 48s", "loss_scale": 1.0, "consumed_samples": 844288, "global_step/max_steps": "3298/6362"} +{"lm loss": 4.94216156, "grad_norm": 0.63263196, "learning_rate": 5.253e-05, "elapsed_time_per_iteration": 6.32459307, "memory(GiB)": 21.51, "elapsed_time": "6h 1m 33s", "remaining_time": "5h 35m 41s", "loss_scale": 1.0, "consumed_samples": 844544, "global_step/max_steps": "3299/6362"} +{"lm loss": 4.91529751, "grad_norm": 0.57738352, "learning_rate": 5.251e-05, "elapsed_time_per_iteration": 6.63535309, "memory(GiB)": 21.51, "elapsed_time": "6h 1m 39s", "remaining_time": "5h 35m 34s", "loss_scale": 1.0, "consumed_samples": 844800, "global_step/max_steps": "3300/6362"} +{"lm loss": 4.91960907, "grad_norm": 0.48938462, "learning_rate": 5.248e-05, "elapsed_time_per_iteration": 6.72908473, "memory(GiB)": 21.51, "elapsed_time": "6h 1m 46s", "remaining_time": "5h 35m 28s", "loss_scale": 1.0, "consumed_samples": 845056, "global_step/max_steps": "3301/6362"} +{"lm loss": 4.93427801, "grad_norm": 0.48311675, "learning_rate": 5.246e-05, "elapsed_time_per_iteration": 6.49405837, "memory(GiB)": 21.51, "elapsed_time": "6h 1m 53s", "remaining_time": "5h 35m 21s", "loss_scale": 1.0, "consumed_samples": 845312, "global_step/max_steps": "3302/6362"} +{"lm loss": 4.92946911, "grad_norm": 0.45408854, "learning_rate": 5.243e-05, "elapsed_time_per_iteration": 6.51032257, "memory(GiB)": 21.51, "elapsed_time": "6h 1m 59s", "remaining_time": "5h 35m 15s", "loss_scale": 1.0, "consumed_samples": 845568, "global_step/max_steps": "3303/6362"} +{"lm loss": 4.95097065, "grad_norm": 0.47646171, "learning_rate": 5.241e-05, "elapsed_time_per_iteration": 6.2854948, "memory(GiB)": 21.51, "elapsed_time": "6h 2m 5s", "remaining_time": "5h 35m 8s", "loss_scale": 1.0, "consumed_samples": 845824, "global_step/max_steps": "3304/6362"} +{"lm loss": 4.91908979, "grad_norm": 0.44358787, "learning_rate": 5.238e-05, "elapsed_time_per_iteration": 6.67051816, "memory(GiB)": 21.51, "elapsed_time": "6h 2m 12s", "remaining_time": "5h 35m 1s", "loss_scale": 1.0, "consumed_samples": 846080, "global_step/max_steps": "3305/6362"} +{"lm loss": 4.9213357, "grad_norm": 0.41092861, "learning_rate": 5.236e-05, "elapsed_time_per_iteration": 6.46798253, "memory(GiB)": 21.51, "elapsed_time": "6h 2m 18s", "remaining_time": "5h 34m 55s", "loss_scale": 1.0, "consumed_samples": 846336, "global_step/max_steps": "3306/6362"} +{"lm loss": 4.91778946, "grad_norm": 0.43988964, "learning_rate": 5.233e-05, "elapsed_time_per_iteration": 6.44871759, "memory(GiB)": 21.51, "elapsed_time": "6h 2m 25s", "remaining_time": "5h 34m 48s", "loss_scale": 1.0, "consumed_samples": 846592, "global_step/max_steps": "3307/6362"} +{"lm loss": 4.94934511, "grad_norm": 0.49438772, "learning_rate": 5.231e-05, "elapsed_time_per_iteration": 6.42370152, "memory(GiB)": 21.51, "elapsed_time": "6h 2m 31s", "remaining_time": "5h 34m 41s", "loss_scale": 1.0, "consumed_samples": 846848, "global_step/max_steps": "3308/6362"} +{"lm loss": 4.92092705, "grad_norm": 0.52780312, "learning_rate": 5.228e-05, "elapsed_time_per_iteration": 6.40746069, "memory(GiB)": 21.51, "elapsed_time": "6h 2m 38s", "remaining_time": "5h 34m 34s", "loss_scale": 1.0, "consumed_samples": 847104, "global_step/max_steps": "3309/6362"} +{"lm loss": 4.92830086, "grad_norm": 0.55444044, "learning_rate": 5.226e-05, "elapsed_time_per_iteration": 6.45005965, "memory(GiB)": 21.51, "elapsed_time": "6h 2m 44s", "remaining_time": "5h 34m 28s", "loss_scale": 1.0, "consumed_samples": 847360, "global_step/max_steps": "3310/6362"} +{"lm loss": 4.93609428, "grad_norm": 0.53875768, "learning_rate": 5.223e-05, "elapsed_time_per_iteration": 6.35215569, "memory(GiB)": 21.51, "elapsed_time": "6h 2m 51s", "remaining_time": "5h 34m 21s", "loss_scale": 1.0, "consumed_samples": 847616, "global_step/max_steps": "3311/6362"} +{"lm loss": 4.94174671, "grad_norm": 0.49039751, "learning_rate": 5.221e-05, "elapsed_time_per_iteration": 6.66496587, "memory(GiB)": 21.51, "elapsed_time": "6h 2m 57s", "remaining_time": "5h 34m 14s", "loss_scale": 1.0, "consumed_samples": 847872, "global_step/max_steps": "3312/6362"} +{"lm loss": 4.9090004, "grad_norm": 0.48342901, "learning_rate": 5.218e-05, "elapsed_time_per_iteration": 6.62216997, "memory(GiB)": 21.51, "elapsed_time": "6h 3m 4s", "remaining_time": "5h 34m 8s", "loss_scale": 1.0, "consumed_samples": 848128, "global_step/max_steps": "3313/6362"} +{"lm loss": 4.9485178, "grad_norm": 0.48760015, "learning_rate": 5.216e-05, "elapsed_time_per_iteration": 6.61952186, "memory(GiB)": 21.51, "elapsed_time": "6h 3m 10s", "remaining_time": "5h 34m 1s", "loss_scale": 1.0, "consumed_samples": 848384, "global_step/max_steps": "3314/6362"} +{"lm loss": 4.90829325, "grad_norm": 0.50501728, "learning_rate": 5.213e-05, "elapsed_time_per_iteration": 6.7371943, "memory(GiB)": 21.51, "elapsed_time": "6h 3m 17s", "remaining_time": "5h 33m 55s", "loss_scale": 1.0, "consumed_samples": 848640, "global_step/max_steps": "3315/6362"} +{"lm loss": 4.91698217, "grad_norm": 0.55835456, "learning_rate": 5.211e-05, "elapsed_time_per_iteration": 6.52150536, "memory(GiB)": 21.51, "elapsed_time": "6h 3m 24s", "remaining_time": "5h 33m 48s", "loss_scale": 1.0, "consumed_samples": 848896, "global_step/max_steps": "3316/6362"} +{"lm loss": 4.92100239, "grad_norm": 0.49527019, "learning_rate": 5.208e-05, "elapsed_time_per_iteration": 6.46913171, "memory(GiB)": 21.51, "elapsed_time": "6h 3m 30s", "remaining_time": "5h 33m 42s", "loss_scale": 1.0, "consumed_samples": 849152, "global_step/max_steps": "3317/6362"} +{"lm loss": 4.92825317, "grad_norm": 0.49955019, "learning_rate": 5.206e-05, "elapsed_time_per_iteration": 6.65189958, "memory(GiB)": 21.51, "elapsed_time": "6h 3m 37s", "remaining_time": "5h 33m 35s", "loss_scale": 1.0, "consumed_samples": 849408, "global_step/max_steps": "3318/6362"} +{"lm loss": 4.92891359, "grad_norm": 0.50539684, "learning_rate": 5.203e-05, "elapsed_time_per_iteration": 6.66139984, "memory(GiB)": 21.51, "elapsed_time": "6h 3m 43s", "remaining_time": "5h 33m 29s", "loss_scale": 1.0, "consumed_samples": 849664, "global_step/max_steps": "3319/6362"} +{"lm loss": 4.92584658, "grad_norm": 0.4916797, "learning_rate": 5.201e-05, "elapsed_time_per_iteration": 6.49168491, "memory(GiB)": 21.51, "elapsed_time": "6h 3m 50s", "remaining_time": "5h 33m 22s", "loss_scale": 1.0, "consumed_samples": 849920, "global_step/max_steps": "3320/6362"} +{"lm loss": 4.9477334, "grad_norm": 0.44149169, "learning_rate": 5.198e-05, "elapsed_time_per_iteration": 6.54111838, "memory(GiB)": 21.51, "elapsed_time": "6h 3m 57s", "remaining_time": "5h 33m 15s", "loss_scale": 1.0, "consumed_samples": 850176, "global_step/max_steps": "3321/6362"} +{"lm loss": 4.930439, "grad_norm": 0.53651696, "learning_rate": 5.196e-05, "elapsed_time_per_iteration": 6.81209445, "memory(GiB)": 21.51, "elapsed_time": "6h 4m 3s", "remaining_time": "5h 33m 9s", "loss_scale": 1.0, "consumed_samples": 850432, "global_step/max_steps": "3322/6362"} +{"lm loss": 4.94312859, "grad_norm": 0.58142489, "learning_rate": 5.193e-05, "elapsed_time_per_iteration": 6.70358825, "memory(GiB)": 21.51, "elapsed_time": "6h 4m 10s", "remaining_time": "5h 33m 3s", "loss_scale": 1.0, "consumed_samples": 850688, "global_step/max_steps": "3323/6362"} +{"lm loss": 4.93033314, "grad_norm": 0.48980939, "learning_rate": 5.19e-05, "elapsed_time_per_iteration": 6.62905812, "memory(GiB)": 21.51, "elapsed_time": "6h 4m 17s", "remaining_time": "5h 32m 56s", "loss_scale": 1.0, "consumed_samples": 850944, "global_step/max_steps": "3324/6362"} +{"lm loss": 4.94726276, "grad_norm": 0.4752233, "learning_rate": 5.188e-05, "elapsed_time_per_iteration": 6.47130299, "memory(GiB)": 21.51, "elapsed_time": "6h 4m 23s", "remaining_time": "5h 32m 49s", "loss_scale": 1.0, "consumed_samples": 851200, "global_step/max_steps": "3325/6362"} +{"lm loss": 4.94028854, "grad_norm": 0.47275764, "learning_rate": 5.185e-05, "elapsed_time_per_iteration": 6.57898879, "memory(GiB)": 21.51, "elapsed_time": "6h 4m 30s", "remaining_time": "5h 32m 43s", "loss_scale": 1.0, "consumed_samples": 851456, "global_step/max_steps": "3326/6362"} +{"lm loss": 4.91680002, "grad_norm": 0.47949606, "learning_rate": 5.183e-05, "elapsed_time_per_iteration": 6.42315292, "memory(GiB)": 21.51, "elapsed_time": "6h 4m 36s", "remaining_time": "5h 32m 36s", "loss_scale": 1.0, "consumed_samples": 851712, "global_step/max_steps": "3327/6362"} +{"lm loss": 4.91633654, "grad_norm": 0.47632182, "learning_rate": 5.18e-05, "elapsed_time_per_iteration": 6.54277349, "memory(GiB)": 21.51, "elapsed_time": "6h 4m 43s", "remaining_time": "5h 32m 29s", "loss_scale": 1.0, "consumed_samples": 851968, "global_step/max_steps": "3328/6362"} +{"lm loss": 4.92617321, "grad_norm": 0.44161218, "learning_rate": 5.178e-05, "elapsed_time_per_iteration": 6.56525493, "memory(GiB)": 21.51, "elapsed_time": "6h 4m 49s", "remaining_time": "5h 32m 23s", "loss_scale": 1.0, "consumed_samples": 852224, "global_step/max_steps": "3329/6362"} +{"lm loss": 4.94438505, "grad_norm": 0.482149, "learning_rate": 5.175e-05, "elapsed_time_per_iteration": 6.38259053, "memory(GiB)": 21.51, "elapsed_time": "6h 4m 56s", "remaining_time": "5h 32m 16s", "loss_scale": 1.0, "consumed_samples": 852480, "global_step/max_steps": "3330/6362"} +{"lm loss": 4.89769793, "grad_norm": 0.53010201, "learning_rate": 5.173e-05, "elapsed_time_per_iteration": 6.6059525, "memory(GiB)": 21.51, "elapsed_time": "6h 5m 2s", "remaining_time": "5h 32m 10s", "loss_scale": 1.0, "consumed_samples": 852736, "global_step/max_steps": "3331/6362"} +{"lm loss": 4.93071985, "grad_norm": 0.53418779, "learning_rate": 5.17e-05, "elapsed_time_per_iteration": 6.51684141, "memory(GiB)": 21.51, "elapsed_time": "6h 5m 9s", "remaining_time": "5h 32m 3s", "loss_scale": 1.0, "consumed_samples": 852992, "global_step/max_steps": "3332/6362"} +{"lm loss": 4.92408228, "grad_norm": 0.55528128, "learning_rate": 5.168e-05, "elapsed_time_per_iteration": 6.59928608, "memory(GiB)": 21.51, "elapsed_time": "6h 5m 15s", "remaining_time": "5h 31m 56s", "loss_scale": 1.0, "consumed_samples": 853248, "global_step/max_steps": "3333/6362"} +{"lm loss": 4.91931581, "grad_norm": 0.47047642, "learning_rate": 5.165e-05, "elapsed_time_per_iteration": 6.42502904, "memory(GiB)": 21.51, "elapsed_time": "6h 5m 22s", "remaining_time": "5h 31m 50s", "loss_scale": 1.0, "consumed_samples": 853504, "global_step/max_steps": "3334/6362"} +{"lm loss": 4.93592501, "grad_norm": 0.48484331, "learning_rate": 5.163e-05, "elapsed_time_per_iteration": 6.64701486, "memory(GiB)": 21.51, "elapsed_time": "6h 5m 28s", "remaining_time": "5h 31m 43s", "loss_scale": 1.0, "consumed_samples": 853760, "global_step/max_steps": "3335/6362"} +{"lm loss": 4.93832016, "grad_norm": 0.52431619, "learning_rate": 5.16e-05, "elapsed_time_per_iteration": 6.65044999, "memory(GiB)": 21.51, "elapsed_time": "6h 5m 35s", "remaining_time": "5h 31m 37s", "loss_scale": 1.0, "consumed_samples": 854016, "global_step/max_steps": "3336/6362"} +{"lm loss": 4.92179489, "grad_norm": 0.44622514, "learning_rate": 5.158e-05, "elapsed_time_per_iteration": 6.5908227, "memory(GiB)": 21.51, "elapsed_time": "6h 5m 42s", "remaining_time": "5h 31m 30s", "loss_scale": 1.0, "consumed_samples": 854272, "global_step/max_steps": "3337/6362"} +{"lm loss": 4.92632294, "grad_norm": 0.53156573, "learning_rate": 5.155e-05, "elapsed_time_per_iteration": 6.69070363, "memory(GiB)": 21.51, "elapsed_time": "6h 5m 48s", "remaining_time": "5h 31m 24s", "loss_scale": 1.0, "consumed_samples": 854528, "global_step/max_steps": "3338/6362"} +{"lm loss": 4.93089151, "grad_norm": 0.49789479, "learning_rate": 5.153e-05, "elapsed_time_per_iteration": 6.71778631, "memory(GiB)": 21.51, "elapsed_time": "6h 5m 55s", "remaining_time": "5h 31m 17s", "loss_scale": 1.0, "consumed_samples": 854784, "global_step/max_steps": "3339/6362"} +{"lm loss": 4.95268154, "grad_norm": 0.46967709, "learning_rate": 5.15e-05, "elapsed_time_per_iteration": 6.61743641, "memory(GiB)": 21.51, "elapsed_time": "6h 6m 2s", "remaining_time": "5h 31m 11s", "loss_scale": 1.0, "consumed_samples": 855040, "global_step/max_steps": "3340/6362"} +{"lm loss": 4.94061565, "grad_norm": 0.4423084, "learning_rate": 5.148e-05, "elapsed_time_per_iteration": 6.56526589, "memory(GiB)": 21.51, "elapsed_time": "6h 6m 8s", "remaining_time": "5h 31m 4s", "loss_scale": 1.0, "consumed_samples": 855296, "global_step/max_steps": "3341/6362"} +{"lm loss": 4.91830492, "grad_norm": 0.5271585, "learning_rate": 5.145e-05, "elapsed_time_per_iteration": 6.70438695, "memory(GiB)": 21.51, "elapsed_time": "6h 6m 15s", "remaining_time": "5h 30m 58s", "loss_scale": 1.0, "consumed_samples": 855552, "global_step/max_steps": "3342/6362"} +{"lm loss": 4.90981054, "grad_norm": 0.54323488, "learning_rate": 5.143e-05, "elapsed_time_per_iteration": 6.68056679, "memory(GiB)": 21.51, "elapsed_time": "6h 6m 22s", "remaining_time": "5h 30m 51s", "loss_scale": 1.0, "consumed_samples": 855808, "global_step/max_steps": "3343/6362"} +{"lm loss": 4.92875624, "grad_norm": 0.54746741, "learning_rate": 5.14e-05, "elapsed_time_per_iteration": 6.78021765, "memory(GiB)": 21.51, "elapsed_time": "6h 6m 28s", "remaining_time": "5h 30m 45s", "loss_scale": 1.0, "consumed_samples": 856064, "global_step/max_steps": "3344/6362"} +{"lm loss": 4.93368483, "grad_norm": 0.54483277, "learning_rate": 5.138e-05, "elapsed_time_per_iteration": 6.52488637, "memory(GiB)": 21.51, "elapsed_time": "6h 6m 35s", "remaining_time": "5h 30m 38s", "loss_scale": 1.0, "consumed_samples": 856320, "global_step/max_steps": "3345/6362"} +{"lm loss": 4.939466, "grad_norm": 0.53012544, "learning_rate": 5.135e-05, "elapsed_time_per_iteration": 6.55503488, "memory(GiB)": 21.51, "elapsed_time": "6h 6m 41s", "remaining_time": "5h 30m 32s", "loss_scale": 1.0, "consumed_samples": 856576, "global_step/max_steps": "3346/6362"} +{"lm loss": 4.93557119, "grad_norm": 0.48841527, "learning_rate": 5.132e-05, "elapsed_time_per_iteration": 6.51832795, "memory(GiB)": 21.51, "elapsed_time": "6h 6m 48s", "remaining_time": "5h 30m 25s", "loss_scale": 1.0, "consumed_samples": 856832, "global_step/max_steps": "3347/6362"} +{"lm loss": 4.91591072, "grad_norm": 0.50709689, "learning_rate": 5.13e-05, "elapsed_time_per_iteration": 6.40572143, "memory(GiB)": 21.51, "elapsed_time": "6h 6m 54s", "remaining_time": "5h 30m 18s", "loss_scale": 1.0, "consumed_samples": 857088, "global_step/max_steps": "3348/6362"} +{"lm loss": 4.92967415, "grad_norm": 0.56388468, "learning_rate": 5.127e-05, "elapsed_time_per_iteration": 6.55736828, "memory(GiB)": 21.51, "elapsed_time": "6h 7m 1s", "remaining_time": "5h 30m 12s", "loss_scale": 1.0, "consumed_samples": 857344, "global_step/max_steps": "3349/6362"} +{"lm loss": 4.90838909, "grad_norm": 0.55035615, "learning_rate": 5.125e-05, "elapsed_time_per_iteration": 6.5727489, "memory(GiB)": 21.51, "elapsed_time": "6h 7m 8s", "remaining_time": "5h 30m 5s", "loss_scale": 1.0, "consumed_samples": 857600, "global_step/max_steps": "3350/6362"} +{"lm loss": 4.93300009, "grad_norm": 0.47665963, "learning_rate": 5.122e-05, "elapsed_time_per_iteration": 6.60405207, "memory(GiB)": 21.51, "elapsed_time": "6h 7m 14s", "remaining_time": "5h 29m 58s", "loss_scale": 1.0, "consumed_samples": 857856, "global_step/max_steps": "3351/6362"} +{"lm loss": 4.91788292, "grad_norm": 0.54583102, "learning_rate": 5.12e-05, "elapsed_time_per_iteration": 6.37240171, "memory(GiB)": 21.51, "elapsed_time": "6h 7m 21s", "remaining_time": "5h 29m 52s", "loss_scale": 1.0, "consumed_samples": 858112, "global_step/max_steps": "3352/6362"} +{"lm loss": 4.92437029, "grad_norm": 0.60975599, "learning_rate": 5.117e-05, "elapsed_time_per_iteration": 6.58731794, "memory(GiB)": 21.51, "elapsed_time": "6h 7m 27s", "remaining_time": "5h 29m 45s", "loss_scale": 1.0, "consumed_samples": 858368, "global_step/max_steps": "3353/6362"} +{"lm loss": 4.90862322, "grad_norm": 0.64402324, "learning_rate": 5.115e-05, "elapsed_time_per_iteration": 6.34888554, "memory(GiB)": 21.51, "elapsed_time": "6h 7m 33s", "remaining_time": "5h 29m 38s", "loss_scale": 1.0, "consumed_samples": 858624, "global_step/max_steps": "3354/6362"} +{"lm loss": 4.91854763, "grad_norm": 0.62539446, "learning_rate": 5.112e-05, "elapsed_time_per_iteration": 6.64083123, "memory(GiB)": 21.51, "elapsed_time": "6h 7m 40s", "remaining_time": "5h 29m 32s", "loss_scale": 1.0, "consumed_samples": 858880, "global_step/max_steps": "3355/6362"} +{"lm loss": 4.92336512, "grad_norm": 0.5152092, "learning_rate": 5.11e-05, "elapsed_time_per_iteration": 6.65685868, "memory(GiB)": 21.51, "elapsed_time": "6h 7m 47s", "remaining_time": "5h 29m 25s", "loss_scale": 1.0, "consumed_samples": 859136, "global_step/max_steps": "3356/6362"} +{"lm loss": 4.93091965, "grad_norm": 0.54008979, "learning_rate": 5.107e-05, "elapsed_time_per_iteration": 6.539258, "memory(GiB)": 21.51, "elapsed_time": "6h 7m 53s", "remaining_time": "5h 29m 19s", "loss_scale": 1.0, "consumed_samples": 859392, "global_step/max_steps": "3357/6362"} +{"lm loss": 4.95028162, "grad_norm": 0.5271793, "learning_rate": 5.105e-05, "elapsed_time_per_iteration": 6.59254551, "memory(GiB)": 21.51, "elapsed_time": "6h 8m 0s", "remaining_time": "5h 29m 12s", "loss_scale": 1.0, "consumed_samples": 859648, "global_step/max_steps": "3358/6362"} +{"lm loss": 4.93889666, "grad_norm": 0.5175485, "learning_rate": 5.102e-05, "elapsed_time_per_iteration": 6.49435186, "memory(GiB)": 21.51, "elapsed_time": "6h 8m 6s", "remaining_time": "5h 29m 6s", "loss_scale": 1.0, "consumed_samples": 859904, "global_step/max_steps": "3359/6362"} +{"lm loss": 4.94208717, "grad_norm": 0.5058713, "learning_rate": 5.1e-05, "elapsed_time_per_iteration": 6.39438629, "memory(GiB)": 21.51, "elapsed_time": "6h 8m 13s", "remaining_time": "5h 28m 59s", "loss_scale": 1.0, "consumed_samples": 860160, "global_step/max_steps": "3360/6362"} +{"lm loss": 4.93853617, "grad_norm": 0.49907386, "learning_rate": 5.097e-05, "elapsed_time_per_iteration": 6.36787415, "memory(GiB)": 21.51, "elapsed_time": "6h 8m 19s", "remaining_time": "5h 28m 52s", "loss_scale": 1.0, "consumed_samples": 860416, "global_step/max_steps": "3361/6362"} +{"lm loss": 4.93178129, "grad_norm": 0.50966364, "learning_rate": 5.095e-05, "elapsed_time_per_iteration": 6.40244341, "memory(GiB)": 21.51, "elapsed_time": "6h 8m 26s", "remaining_time": "5h 28m 45s", "loss_scale": 1.0, "consumed_samples": 860672, "global_step/max_steps": "3362/6362"} +{"lm loss": 4.91708994, "grad_norm": 0.48138008, "learning_rate": 5.092e-05, "elapsed_time_per_iteration": 6.67549825, "memory(GiB)": 21.51, "elapsed_time": "6h 8m 32s", "remaining_time": "5h 28m 39s", "loss_scale": 1.0, "consumed_samples": 860928, "global_step/max_steps": "3363/6362"} +{"lm loss": 4.92904758, "grad_norm": 0.46109426, "learning_rate": 5.09e-05, "elapsed_time_per_iteration": 6.71190834, "memory(GiB)": 21.51, "elapsed_time": "6h 8m 39s", "remaining_time": "5h 28m 32s", "loss_scale": 1.0, "consumed_samples": 861184, "global_step/max_steps": "3364/6362"} +{"lm loss": 4.89268637, "grad_norm": 0.47453085, "learning_rate": 5.087e-05, "elapsed_time_per_iteration": 6.60915685, "memory(GiB)": 21.51, "elapsed_time": "6h 8m 46s", "remaining_time": "5h 28m 26s", "loss_scale": 1.0, "consumed_samples": 861440, "global_step/max_steps": "3365/6362"} +{"lm loss": 4.93966436, "grad_norm": 0.47815561, "learning_rate": 5.085e-05, "elapsed_time_per_iteration": 6.35534191, "memory(GiB)": 21.51, "elapsed_time": "6h 8m 52s", "remaining_time": "5h 28m 19s", "loss_scale": 1.0, "consumed_samples": 861696, "global_step/max_steps": "3366/6362"} +{"lm loss": 4.92713213, "grad_norm": 0.43875602, "learning_rate": 5.082e-05, "elapsed_time_per_iteration": 6.47274542, "memory(GiB)": 21.51, "elapsed_time": "6h 8m 58s", "remaining_time": "5h 28m 12s", "loss_scale": 1.0, "consumed_samples": 861952, "global_step/max_steps": "3367/6362"} +{"lm loss": 4.92783976, "grad_norm": 0.53681368, "learning_rate": 5.08e-05, "elapsed_time_per_iteration": 6.61265445, "memory(GiB)": 21.51, "elapsed_time": "6h 9m 5s", "remaining_time": "5h 28m 6s", "loss_scale": 1.0, "consumed_samples": 862208, "global_step/max_steps": "3368/6362"} +{"lm loss": 4.94550037, "grad_norm": 0.50243425, "learning_rate": 5.077e-05, "elapsed_time_per_iteration": 6.64171028, "memory(GiB)": 21.51, "elapsed_time": "6h 9m 12s", "remaining_time": "5h 27m 59s", "loss_scale": 1.0, "consumed_samples": 862464, "global_step/max_steps": "3369/6362"} +{"lm loss": 4.92197657, "grad_norm": 0.54998356, "learning_rate": 5.074e-05, "elapsed_time_per_iteration": 6.6343255, "memory(GiB)": 21.51, "elapsed_time": "6h 9m 18s", "remaining_time": "5h 27m 53s", "loss_scale": 1.0, "consumed_samples": 862720, "global_step/max_steps": "3370/6362"} +{"lm loss": 4.92252541, "grad_norm": 0.59510654, "learning_rate": 5.072e-05, "elapsed_time_per_iteration": 6.60609317, "memory(GiB)": 21.51, "elapsed_time": "6h 9m 25s", "remaining_time": "5h 27m 46s", "loss_scale": 1.0, "consumed_samples": 862976, "global_step/max_steps": "3371/6362"} +{"lm loss": 4.93377352, "grad_norm": 0.49885094, "learning_rate": 5.069e-05, "elapsed_time_per_iteration": 6.6910696, "memory(GiB)": 21.51, "elapsed_time": "6h 9m 32s", "remaining_time": "5h 27m 40s", "loss_scale": 1.0, "consumed_samples": 863232, "global_step/max_steps": "3372/6362"} +{"lm loss": 4.94410706, "grad_norm": 0.44891343, "learning_rate": 5.067e-05, "elapsed_time_per_iteration": 6.75688219, "memory(GiB)": 21.51, "elapsed_time": "6h 9m 38s", "remaining_time": "5h 27m 33s", "loss_scale": 1.0, "consumed_samples": 863488, "global_step/max_steps": "3373/6362"} +{"lm loss": 4.92617798, "grad_norm": 0.48535597, "learning_rate": 5.064e-05, "elapsed_time_per_iteration": 6.71624041, "memory(GiB)": 21.51, "elapsed_time": "6h 9m 45s", "remaining_time": "5h 27m 27s", "loss_scale": 1.0, "consumed_samples": 863744, "global_step/max_steps": "3374/6362"} +{"lm loss": 4.9315424, "grad_norm": 0.53082353, "learning_rate": 5.062e-05, "elapsed_time_per_iteration": 6.59254885, "memory(GiB)": 21.51, "elapsed_time": "6h 9m 52s", "remaining_time": "5h 27m 20s", "loss_scale": 1.0, "consumed_samples": 864000, "global_step/max_steps": "3375/6362"} +{"lm loss": 4.92267847, "grad_norm": 0.46419945, "learning_rate": 5.059e-05, "elapsed_time_per_iteration": 6.41019607, "memory(GiB)": 21.51, "elapsed_time": "6h 9m 58s", "remaining_time": "5h 27m 14s", "loss_scale": 1.0, "consumed_samples": 864256, "global_step/max_steps": "3376/6362"} +{"lm loss": 4.94296741, "grad_norm": 0.5237636, "learning_rate": 5.057e-05, "elapsed_time_per_iteration": 6.36117673, "memory(GiB)": 21.51, "elapsed_time": "6h 10m 4s", "remaining_time": "5h 27m 7s", "loss_scale": 1.0, "consumed_samples": 864512, "global_step/max_steps": "3377/6362"} +{"lm loss": 4.91941786, "grad_norm": 0.50991625, "learning_rate": 5.054e-05, "elapsed_time_per_iteration": 6.5128839, "memory(GiB)": 21.51, "elapsed_time": "6h 10m 11s", "remaining_time": "5h 27m 0s", "loss_scale": 1.0, "consumed_samples": 864768, "global_step/max_steps": "3378/6362"} +{"lm loss": 4.94135618, "grad_norm": 0.4468689, "learning_rate": 5.052e-05, "elapsed_time_per_iteration": 6.69194603, "memory(GiB)": 21.51, "elapsed_time": "6h 10m 18s", "remaining_time": "5h 26m 54s", "loss_scale": 1.0, "consumed_samples": 865024, "global_step/max_steps": "3379/6362"} +{"lm loss": 4.92695045, "grad_norm": 0.45667186, "learning_rate": 5.049e-05, "elapsed_time_per_iteration": 6.51886868, "memory(GiB)": 21.51, "elapsed_time": "6h 10m 24s", "remaining_time": "5h 26m 47s", "loss_scale": 1.0, "consumed_samples": 865280, "global_step/max_steps": "3380/6362"} +{"lm loss": 4.9230237, "grad_norm": 0.50314295, "learning_rate": 5.047e-05, "elapsed_time_per_iteration": 6.63165355, "memory(GiB)": 21.51, "elapsed_time": "6h 10m 31s", "remaining_time": "5h 26m 41s", "loss_scale": 1.0, "consumed_samples": 865536, "global_step/max_steps": "3381/6362"} +{"lm loss": 4.9368639, "grad_norm": 0.48820373, "learning_rate": 5.044e-05, "elapsed_time_per_iteration": 6.59446716, "memory(GiB)": 21.51, "elapsed_time": "6h 10m 37s", "remaining_time": "5h 26m 34s", "loss_scale": 1.0, "consumed_samples": 865792, "global_step/max_steps": "3382/6362"} +{"lm loss": 4.91336918, "grad_norm": 0.44895783, "learning_rate": 5.042e-05, "elapsed_time_per_iteration": 6.6765058, "memory(GiB)": 21.51, "elapsed_time": "6h 10m 44s", "remaining_time": "5h 26m 28s", "loss_scale": 1.0, "consumed_samples": 866048, "global_step/max_steps": "3383/6362"} +{"lm loss": 4.91710854, "grad_norm": 0.45987391, "learning_rate": 5.039e-05, "elapsed_time_per_iteration": 6.76647973, "memory(GiB)": 21.51, "elapsed_time": "6h 10m 51s", "remaining_time": "5h 26m 21s", "loss_scale": 1.0, "consumed_samples": 866304, "global_step/max_steps": "3384/6362"} +{"lm loss": 4.92926311, "grad_norm": 0.46835402, "learning_rate": 5.037e-05, "elapsed_time_per_iteration": 6.39405704, "memory(GiB)": 21.51, "elapsed_time": "6h 10m 57s", "remaining_time": "5h 26m 14s", "loss_scale": 1.0, "consumed_samples": 866560, "global_step/max_steps": "3385/6362"} +{"lm loss": 4.92413044, "grad_norm": 0.45623729, "learning_rate": 5.034e-05, "elapsed_time_per_iteration": 6.37716866, "memory(GiB)": 21.51, "elapsed_time": "6h 11m 4s", "remaining_time": "5h 26m 8s", "loss_scale": 1.0, "consumed_samples": 866816, "global_step/max_steps": "3386/6362"} +{"lm loss": 4.90378809, "grad_norm": 0.42472026, "learning_rate": 5.032e-05, "elapsed_time_per_iteration": 6.6619184, "memory(GiB)": 21.51, "elapsed_time": "6h 11m 10s", "remaining_time": "5h 26m 1s", "loss_scale": 1.0, "consumed_samples": 867072, "global_step/max_steps": "3387/6362"} +{"lm loss": 4.93369055, "grad_norm": 0.43087444, "learning_rate": 5.029e-05, "elapsed_time_per_iteration": 6.71098804, "memory(GiB)": 21.51, "elapsed_time": "6h 11m 17s", "remaining_time": "5h 25m 55s", "loss_scale": 1.0, "consumed_samples": 867328, "global_step/max_steps": "3388/6362"} +{"lm loss": 4.90416431, "grad_norm": 0.45278054, "learning_rate": 5.027e-05, "elapsed_time_per_iteration": 6.44243836, "memory(GiB)": 21.51, "elapsed_time": "6h 11m 23s", "remaining_time": "5h 25m 48s", "loss_scale": 1.0, "consumed_samples": 867584, "global_step/max_steps": "3389/6362"} +{"lm loss": 4.92475557, "grad_norm": 0.46820292, "learning_rate": 5.024e-05, "elapsed_time_per_iteration": 6.39170909, "memory(GiB)": 21.51, "elapsed_time": "6h 11m 30s", "remaining_time": "5h 25m 41s", "loss_scale": 1.0, "consumed_samples": 867840, "global_step/max_steps": "3390/6362"} +{"lm loss": 4.92555332, "grad_norm": 0.46207938, "learning_rate": 5.022e-05, "elapsed_time_per_iteration": 6.65617251, "memory(GiB)": 21.51, "elapsed_time": "6h 11m 36s", "remaining_time": "5h 25m 35s", "loss_scale": 1.0, "consumed_samples": 868096, "global_step/max_steps": "3391/6362"} +{"lm loss": 4.92831135, "grad_norm": 0.46906045, "learning_rate": 5.019e-05, "elapsed_time_per_iteration": 6.58795738, "memory(GiB)": 21.51, "elapsed_time": "6h 11m 43s", "remaining_time": "5h 25m 28s", "loss_scale": 1.0, "consumed_samples": 868352, "global_step/max_steps": "3392/6362"} +{"lm loss": 4.91916656, "grad_norm": 0.47766581, "learning_rate": 5.017e-05, "elapsed_time_per_iteration": 6.49407387, "memory(GiB)": 21.51, "elapsed_time": "6h 11m 50s", "remaining_time": "5h 25m 22s", "loss_scale": 1.0, "consumed_samples": 868608, "global_step/max_steps": "3393/6362"} +{"lm loss": 4.93227291, "grad_norm": 0.45427251, "learning_rate": 5.014e-05, "elapsed_time_per_iteration": 6.65307188, "memory(GiB)": 21.51, "elapsed_time": "6h 11m 56s", "remaining_time": "5h 25m 15s", "loss_scale": 1.0, "consumed_samples": 868864, "global_step/max_steps": "3394/6362"} +{"lm loss": 4.92335367, "grad_norm": 0.46422827, "learning_rate": 5.011e-05, "elapsed_time_per_iteration": 6.47924042, "memory(GiB)": 21.51, "elapsed_time": "6h 12m 3s", "remaining_time": "5h 25m 8s", "loss_scale": 1.0, "consumed_samples": 869120, "global_step/max_steps": "3395/6362"} +{"lm loss": 4.92629147, "grad_norm": 0.45092297, "learning_rate": 5.009e-05, "elapsed_time_per_iteration": 6.63027215, "memory(GiB)": 21.51, "elapsed_time": "6h 12m 9s", "remaining_time": "5h 25m 2s", "loss_scale": 1.0, "consumed_samples": 869376, "global_step/max_steps": "3396/6362"} +{"lm loss": 4.90791941, "grad_norm": 0.4808422, "learning_rate": 5.006e-05, "elapsed_time_per_iteration": 6.70399141, "memory(GiB)": 21.51, "elapsed_time": "6h 12m 16s", "remaining_time": "5h 24m 55s", "loss_scale": 1.0, "consumed_samples": 869632, "global_step/max_steps": "3397/6362"} +{"lm loss": 4.91538858, "grad_norm": 0.50082713, "learning_rate": 5.004e-05, "elapsed_time_per_iteration": 6.45048189, "memory(GiB)": 21.51, "elapsed_time": "6h 12m 22s", "remaining_time": "5h 24m 49s", "loss_scale": 1.0, "consumed_samples": 869888, "global_step/max_steps": "3398/6362"} +{"lm loss": 4.92156982, "grad_norm": 0.58848757, "learning_rate": 5.001e-05, "elapsed_time_per_iteration": 6.84104776, "memory(GiB)": 21.51, "elapsed_time": "6h 12m 29s", "remaining_time": "5h 24m 42s", "loss_scale": 1.0, "consumed_samples": 870144, "global_step/max_steps": "3399/6362"} +{"lm loss": 4.92446089, "grad_norm": 0.6291787, "learning_rate": 4.999e-05, "elapsed_time_per_iteration": 6.55386853, "memory(GiB)": 21.51, "elapsed_time": "6h 12m 36s", "remaining_time": "5h 24m 36s", "loss_scale": 1.0, "consumed_samples": 870400, "global_step/max_steps": "3400/6362"} +{"lm loss": 4.92943192, "grad_norm": 0.58201832, "learning_rate": 4.996e-05, "elapsed_time_per_iteration": 6.68777156, "memory(GiB)": 21.51, "elapsed_time": "6h 12m 43s", "remaining_time": "5h 24m 29s", "loss_scale": 1.0, "consumed_samples": 870656, "global_step/max_steps": "3401/6362"} +{"lm loss": 4.92165327, "grad_norm": 0.47737399, "learning_rate": 4.994e-05, "elapsed_time_per_iteration": 6.64767861, "memory(GiB)": 21.51, "elapsed_time": "6h 12m 49s", "remaining_time": "5h 24m 23s", "loss_scale": 1.0, "consumed_samples": 870912, "global_step/max_steps": "3402/6362"} +{"lm loss": 4.89467812, "grad_norm": 0.56091881, "learning_rate": 4.991e-05, "elapsed_time_per_iteration": 6.47418928, "memory(GiB)": 21.51, "elapsed_time": "6h 12m 56s", "remaining_time": "5h 24m 16s", "loss_scale": 1.0, "consumed_samples": 871168, "global_step/max_steps": "3403/6362"} +{"lm loss": 4.9143095, "grad_norm": 0.53168416, "learning_rate": 4.989e-05, "elapsed_time_per_iteration": 6.59883142, "memory(GiB)": 21.51, "elapsed_time": "6h 13m 2s", "remaining_time": "5h 24m 10s", "loss_scale": 1.0, "consumed_samples": 871424, "global_step/max_steps": "3404/6362"} +{"lm loss": 4.94756126, "grad_norm": 0.48001963, "learning_rate": 4.986e-05, "elapsed_time_per_iteration": 6.48451209, "memory(GiB)": 21.51, "elapsed_time": "6h 13m 9s", "remaining_time": "5h 24m 3s", "loss_scale": 1.0, "consumed_samples": 871680, "global_step/max_steps": "3405/6362"} +{"lm loss": 4.93233156, "grad_norm": 0.50880665, "learning_rate": 4.984e-05, "elapsed_time_per_iteration": 6.50269389, "memory(GiB)": 21.51, "elapsed_time": "6h 13m 15s", "remaining_time": "5h 23m 56s", "loss_scale": 1.0, "consumed_samples": 871936, "global_step/max_steps": "3406/6362"} +{"lm loss": 4.91182661, "grad_norm": 0.5110538, "learning_rate": 4.981e-05, "elapsed_time_per_iteration": 6.40621257, "memory(GiB)": 21.51, "elapsed_time": "6h 13m 22s", "remaining_time": "5h 23m 50s", "loss_scale": 1.0, "consumed_samples": 872192, "global_step/max_steps": "3407/6362"} +{"lm loss": 4.95614147, "grad_norm": 0.57928824, "learning_rate": 4.979e-05, "elapsed_time_per_iteration": 6.47599506, "memory(GiB)": 21.51, "elapsed_time": "6h 13m 28s", "remaining_time": "5h 23m 43s", "loss_scale": 1.0, "consumed_samples": 872448, "global_step/max_steps": "3408/6362"} +{"lm loss": 4.91730261, "grad_norm": 0.51468939, "learning_rate": 4.976e-05, "elapsed_time_per_iteration": 6.42526174, "memory(GiB)": 21.51, "elapsed_time": "6h 13m 35s", "remaining_time": "5h 23m 36s", "loss_scale": 1.0, "consumed_samples": 872704, "global_step/max_steps": "3409/6362"} +{"lm loss": 4.91887808, "grad_norm": 0.46848214, "learning_rate": 4.974e-05, "elapsed_time_per_iteration": 6.49134851, "memory(GiB)": 21.51, "elapsed_time": "6h 13m 41s", "remaining_time": "5h 23m 30s", "loss_scale": 1.0, "consumed_samples": 872960, "global_step/max_steps": "3410/6362"} +{"lm loss": 4.9428339, "grad_norm": 0.4387846, "learning_rate": 4.971e-05, "elapsed_time_per_iteration": 6.6984117, "memory(GiB)": 21.51, "elapsed_time": "6h 13m 48s", "remaining_time": "5h 23m 23s", "loss_scale": 1.0, "consumed_samples": 873216, "global_step/max_steps": "3411/6362"} +{"lm loss": 4.90448952, "grad_norm": 0.51635492, "learning_rate": 4.969e-05, "elapsed_time_per_iteration": 6.70523572, "memory(GiB)": 21.51, "elapsed_time": "6h 13m 54s", "remaining_time": "5h 23m 17s", "loss_scale": 1.0, "consumed_samples": 873472, "global_step/max_steps": "3412/6362"} +{"lm loss": 4.92213392, "grad_norm": 0.45964989, "learning_rate": 4.966e-05, "elapsed_time_per_iteration": 6.52950954, "memory(GiB)": 21.51, "elapsed_time": "6h 14m 1s", "remaining_time": "5h 23m 10s", "loss_scale": 1.0, "consumed_samples": 873728, "global_step/max_steps": "3413/6362"} +{"lm loss": 4.92794037, "grad_norm": 0.51010424, "learning_rate": 4.964e-05, "elapsed_time_per_iteration": 6.62602592, "memory(GiB)": 21.51, "elapsed_time": "6h 14m 8s", "remaining_time": "5h 23m 3s", "loss_scale": 1.0, "consumed_samples": 873984, "global_step/max_steps": "3414/6362"} +{"lm loss": 4.93958855, "grad_norm": 0.52623779, "learning_rate": 4.961e-05, "elapsed_time_per_iteration": 6.64097309, "memory(GiB)": 21.51, "elapsed_time": "6h 14m 14s", "remaining_time": "5h 22m 57s", "loss_scale": 1.0, "consumed_samples": 874240, "global_step/max_steps": "3415/6362"} +{"lm loss": 4.92820549, "grad_norm": 0.53683615, "learning_rate": 4.959e-05, "elapsed_time_per_iteration": 6.71924639, "memory(GiB)": 21.51, "elapsed_time": "6h 14m 21s", "remaining_time": "5h 22m 51s", "loss_scale": 1.0, "consumed_samples": 874496, "global_step/max_steps": "3416/6362"} +{"lm loss": 4.90566111, "grad_norm": 0.45173863, "learning_rate": 4.956e-05, "elapsed_time_per_iteration": 6.72915602, "memory(GiB)": 21.51, "elapsed_time": "6h 14m 28s", "remaining_time": "5h 22m 44s", "loss_scale": 1.0, "consumed_samples": 874752, "global_step/max_steps": "3417/6362"} +{"lm loss": 4.89116144, "grad_norm": 0.46620816, "learning_rate": 4.954e-05, "elapsed_time_per_iteration": 6.68436837, "memory(GiB)": 21.51, "elapsed_time": "6h 14m 34s", "remaining_time": "5h 22m 38s", "loss_scale": 1.0, "consumed_samples": 875008, "global_step/max_steps": "3418/6362"} +{"lm loss": 4.90942574, "grad_norm": 0.43775347, "learning_rate": 4.951e-05, "elapsed_time_per_iteration": 6.63728499, "memory(GiB)": 21.51, "elapsed_time": "6h 14m 41s", "remaining_time": "5h 22m 31s", "loss_scale": 1.0, "consumed_samples": 875264, "global_step/max_steps": "3419/6362"} +{"lm loss": 4.94767284, "grad_norm": 0.46257001, "learning_rate": 4.949e-05, "elapsed_time_per_iteration": 6.40701222, "memory(GiB)": 21.51, "elapsed_time": "6h 14m 47s", "remaining_time": "5h 22m 24s", "loss_scale": 1.0, "consumed_samples": 875520, "global_step/max_steps": "3420/6362"} +{"lm loss": 4.88790417, "grad_norm": 0.50384265, "learning_rate": 4.946e-05, "elapsed_time_per_iteration": 6.45484257, "memory(GiB)": 21.51, "elapsed_time": "6h 14m 54s", "remaining_time": "5h 22m 18s", "loss_scale": 1.0, "consumed_samples": 875776, "global_step/max_steps": "3421/6362"} +{"lm loss": 4.91998148, "grad_norm": 0.46627727, "learning_rate": 4.943e-05, "elapsed_time_per_iteration": 6.56287932, "memory(GiB)": 21.51, "elapsed_time": "6h 15m 0s", "remaining_time": "5h 22m 11s", "loss_scale": 1.0, "consumed_samples": 876032, "global_step/max_steps": "3422/6362"} +{"lm loss": 4.90582323, "grad_norm": 0.47717798, "learning_rate": 4.941e-05, "elapsed_time_per_iteration": 6.56621981, "memory(GiB)": 21.51, "elapsed_time": "6h 15m 7s", "remaining_time": "5h 22m 4s", "loss_scale": 1.0, "consumed_samples": 876288, "global_step/max_steps": "3423/6362"} +{"lm loss": 4.92934227, "grad_norm": 0.46397442, "learning_rate": 4.938e-05, "elapsed_time_per_iteration": 6.42124748, "memory(GiB)": 21.51, "elapsed_time": "6h 15m 13s", "remaining_time": "5h 21m 58s", "loss_scale": 1.0, "consumed_samples": 876544, "global_step/max_steps": "3424/6362"} +{"lm loss": 4.90961027, "grad_norm": 0.40253332, "learning_rate": 4.936e-05, "elapsed_time_per_iteration": 6.41252923, "memory(GiB)": 21.51, "elapsed_time": "6h 15m 20s", "remaining_time": "5h 21m 51s", "loss_scale": 1.0, "consumed_samples": 876800, "global_step/max_steps": "3425/6362"} +{"lm loss": 4.89905882, "grad_norm": 0.47213674, "learning_rate": 4.933e-05, "elapsed_time_per_iteration": 6.27486968, "memory(GiB)": 21.51, "elapsed_time": "6h 15m 26s", "remaining_time": "5h 21m 44s", "loss_scale": 1.0, "consumed_samples": 877056, "global_step/max_steps": "3426/6362"} +{"lm loss": 4.92953491, "grad_norm": 0.51848078, "learning_rate": 4.931e-05, "elapsed_time_per_iteration": 6.31279182, "memory(GiB)": 21.51, "elapsed_time": "6h 15m 32s", "remaining_time": "5h 21m 37s", "loss_scale": 1.0, "consumed_samples": 877312, "global_step/max_steps": "3427/6362"} +{"lm loss": 4.92416811, "grad_norm": 0.5328154, "learning_rate": 4.928e-05, "elapsed_time_per_iteration": 6.44573426, "memory(GiB)": 21.51, "elapsed_time": "6h 15m 39s", "remaining_time": "5h 21m 31s", "loss_scale": 1.0, "consumed_samples": 877568, "global_step/max_steps": "3428/6362"} +{"lm loss": 4.926754, "grad_norm": 0.4861635, "learning_rate": 4.926e-05, "elapsed_time_per_iteration": 6.66115093, "memory(GiB)": 21.51, "elapsed_time": "6h 15m 46s", "remaining_time": "5h 21m 24s", "loss_scale": 1.0, "consumed_samples": 877824, "global_step/max_steps": "3429/6362"} +{"lm loss": 4.91746092, "grad_norm": 0.45188239, "learning_rate": 4.923e-05, "elapsed_time_per_iteration": 6.49630785, "memory(GiB)": 21.51, "elapsed_time": "6h 15m 52s", "remaining_time": "5h 21m 18s", "loss_scale": 1.0, "consumed_samples": 878080, "global_step/max_steps": "3430/6362"} +{"lm loss": 4.91023588, "grad_norm": 0.54055732, "learning_rate": 4.921e-05, "elapsed_time_per_iteration": 6.54157305, "memory(GiB)": 21.51, "elapsed_time": "6h 15m 59s", "remaining_time": "5h 21m 11s", "loss_scale": 1.0, "consumed_samples": 878336, "global_step/max_steps": "3431/6362"} +{"lm loss": 4.90735769, "grad_norm": 0.60947216, "learning_rate": 4.918e-05, "elapsed_time_per_iteration": 6.50686312, "memory(GiB)": 21.51, "elapsed_time": "6h 16m 5s", "remaining_time": "5h 21m 4s", "loss_scale": 1.0, "consumed_samples": 878592, "global_step/max_steps": "3432/6362"} +{"lm loss": 4.90986061, "grad_norm": 0.45582494, "learning_rate": 4.916e-05, "elapsed_time_per_iteration": 6.44285941, "memory(GiB)": 21.51, "elapsed_time": "6h 16m 11s", "remaining_time": "5h 20m 58s", "loss_scale": 1.0, "consumed_samples": 878848, "global_step/max_steps": "3433/6362"} +{"lm loss": 4.93959761, "grad_norm": 0.51016665, "learning_rate": 4.913e-05, "elapsed_time_per_iteration": 6.48227143, "memory(GiB)": 21.51, "elapsed_time": "6h 16m 18s", "remaining_time": "5h 20m 51s", "loss_scale": 1.0, "consumed_samples": 879104, "global_step/max_steps": "3434/6362"} +{"lm loss": 4.92614651, "grad_norm": 0.50570023, "learning_rate": 4.911e-05, "elapsed_time_per_iteration": 6.46376967, "memory(GiB)": 21.51, "elapsed_time": "6h 16m 24s", "remaining_time": "5h 20m 44s", "loss_scale": 1.0, "consumed_samples": 879360, "global_step/max_steps": "3435/6362"} +{"lm loss": 4.93174553, "grad_norm": 0.46444046, "learning_rate": 4.908e-05, "elapsed_time_per_iteration": 6.56686902, "memory(GiB)": 21.51, "elapsed_time": "6h 16m 31s", "remaining_time": "5h 20m 38s", "loss_scale": 1.0, "consumed_samples": 879616, "global_step/max_steps": "3436/6362"} +{"lm loss": 4.91433001, "grad_norm": 0.42461568, "learning_rate": 4.906e-05, "elapsed_time_per_iteration": 6.54438448, "memory(GiB)": 21.51, "elapsed_time": "6h 16m 38s", "remaining_time": "5h 20m 31s", "loss_scale": 1.0, "consumed_samples": 879872, "global_step/max_steps": "3437/6362"} +{"lm loss": 4.92913198, "grad_norm": 0.39779264, "learning_rate": 4.903e-05, "elapsed_time_per_iteration": 6.50247931, "memory(GiB)": 21.51, "elapsed_time": "6h 16m 44s", "remaining_time": "5h 20m 25s", "loss_scale": 1.0, "consumed_samples": 880128, "global_step/max_steps": "3438/6362"} +{"lm loss": 4.92978811, "grad_norm": 0.46546808, "learning_rate": 4.901e-05, "elapsed_time_per_iteration": 6.57522869, "memory(GiB)": 21.51, "elapsed_time": "6h 16m 51s", "remaining_time": "5h 20m 18s", "loss_scale": 1.0, "consumed_samples": 880384, "global_step/max_steps": "3439/6362"} +{"lm loss": 4.92485476, "grad_norm": 0.44540405, "learning_rate": 4.898e-05, "elapsed_time_per_iteration": 6.78398991, "memory(GiB)": 21.51, "elapsed_time": "6h 16m 57s", "remaining_time": "5h 20m 12s", "loss_scale": 1.0, "consumed_samples": 880640, "global_step/max_steps": "3440/6362"} +{"lm loss": 4.94166183, "grad_norm": 0.49884492, "learning_rate": 4.896e-05, "elapsed_time_per_iteration": 6.51518202, "memory(GiB)": 21.51, "elapsed_time": "6h 17m 4s", "remaining_time": "5h 20m 5s", "loss_scale": 1.0, "consumed_samples": 880896, "global_step/max_steps": "3441/6362"} +{"lm loss": 4.94197845, "grad_norm": 0.46270707, "learning_rate": 4.893e-05, "elapsed_time_per_iteration": 6.40351725, "memory(GiB)": 21.51, "elapsed_time": "6h 17m 10s", "remaining_time": "5h 19m 58s", "loss_scale": 1.0, "consumed_samples": 881152, "global_step/max_steps": "3442/6362"} +{"lm loss": 4.93429184, "grad_norm": 0.46545392, "learning_rate": 4.891e-05, "elapsed_time_per_iteration": 6.67171574, "memory(GiB)": 21.51, "elapsed_time": "6h 17m 17s", "remaining_time": "5h 19m 52s", "loss_scale": 1.0, "consumed_samples": 881408, "global_step/max_steps": "3443/6362"} +{"lm loss": 4.93829632, "grad_norm": 0.50411814, "learning_rate": 4.888e-05, "elapsed_time_per_iteration": 6.56101727, "memory(GiB)": 21.51, "elapsed_time": "6h 17m 24s", "remaining_time": "5h 19m 45s", "loss_scale": 1.0, "consumed_samples": 881664, "global_step/max_steps": "3444/6362"} +{"lm loss": 4.9347105, "grad_norm": 0.4479143, "learning_rate": 4.886e-05, "elapsed_time_per_iteration": 6.62184811, "memory(GiB)": 21.51, "elapsed_time": "6h 17m 30s", "remaining_time": "5h 19m 39s", "loss_scale": 1.0, "consumed_samples": 881920, "global_step/max_steps": "3445/6362"} +{"lm loss": 4.93395424, "grad_norm": 0.54309762, "learning_rate": 4.883e-05, "elapsed_time_per_iteration": 6.27847433, "memory(GiB)": 21.51, "elapsed_time": "6h 17m 36s", "remaining_time": "5h 19m 32s", "loss_scale": 1.0, "consumed_samples": 882176, "global_step/max_steps": "3446/6362"} +{"lm loss": 4.92479372, "grad_norm": 0.65860695, "learning_rate": 4.881e-05, "elapsed_time_per_iteration": 6.42403412, "memory(GiB)": 21.51, "elapsed_time": "6h 17m 43s", "remaining_time": "5h 19m 25s", "loss_scale": 1.0, "consumed_samples": 882432, "global_step/max_steps": "3447/6362"} +{"lm loss": 4.92274523, "grad_norm": 0.63514066, "learning_rate": 4.878e-05, "elapsed_time_per_iteration": 6.66365623, "memory(GiB)": 21.51, "elapsed_time": "6h 17m 50s", "remaining_time": "5h 19m 19s", "loss_scale": 1.0, "consumed_samples": 882688, "global_step/max_steps": "3448/6362"} +{"lm loss": 4.90822029, "grad_norm": 0.53838724, "learning_rate": 4.875e-05, "elapsed_time_per_iteration": 6.6528666, "memory(GiB)": 21.51, "elapsed_time": "6h 17m 56s", "remaining_time": "5h 19m 12s", "loss_scale": 1.0, "consumed_samples": 882944, "global_step/max_steps": "3449/6362"} +{"lm loss": 4.91152573, "grad_norm": 0.4936347, "learning_rate": 4.873e-05, "elapsed_time_per_iteration": 6.70362401, "memory(GiB)": 21.51, "elapsed_time": "6h 18m 3s", "remaining_time": "5h 19m 6s", "loss_scale": 1.0, "consumed_samples": 883200, "global_step/max_steps": "3450/6362"} +{"lm loss": 4.90052176, "grad_norm": 0.622522, "learning_rate": 4.87e-05, "elapsed_time_per_iteration": 6.67910933, "memory(GiB)": 21.51, "elapsed_time": "6h 18m 10s", "remaining_time": "5h 18m 59s", "loss_scale": 1.0, "consumed_samples": 883456, "global_step/max_steps": "3451/6362"} +{"lm loss": 4.92098045, "grad_norm": 0.53930807, "learning_rate": 4.868e-05, "elapsed_time_per_iteration": 6.70425367, "memory(GiB)": 21.51, "elapsed_time": "6h 18m 16s", "remaining_time": "5h 18m 53s", "loss_scale": 1.0, "consumed_samples": 883712, "global_step/max_steps": "3452/6362"} +{"lm loss": 4.91237831, "grad_norm": 0.48318899, "learning_rate": 4.865e-05, "elapsed_time_per_iteration": 6.54520559, "memory(GiB)": 21.51, "elapsed_time": "6h 18m 23s", "remaining_time": "5h 18m 46s", "loss_scale": 1.0, "consumed_samples": 883968, "global_step/max_steps": "3453/6362"} +{"lm loss": 4.93766165, "grad_norm": 0.55268431, "learning_rate": 4.863e-05, "elapsed_time_per_iteration": 6.62339354, "memory(GiB)": 21.51, "elapsed_time": "6h 18m 29s", "remaining_time": "5h 18m 40s", "loss_scale": 1.0, "consumed_samples": 884224, "global_step/max_steps": "3454/6362"} +{"lm loss": 4.92628956, "grad_norm": 0.63020498, "learning_rate": 4.86e-05, "elapsed_time_per_iteration": 6.49125695, "memory(GiB)": 21.51, "elapsed_time": "6h 18m 36s", "remaining_time": "5h 18m 33s", "loss_scale": 1.0, "consumed_samples": 884480, "global_step/max_steps": "3455/6362"} +{"lm loss": 4.90457869, "grad_norm": 0.54178482, "learning_rate": 4.858e-05, "elapsed_time_per_iteration": 6.47426486, "memory(GiB)": 21.51, "elapsed_time": "6h 18m 42s", "remaining_time": "5h 18m 26s", "loss_scale": 1.0, "consumed_samples": 884736, "global_step/max_steps": "3456/6362"} +{"lm loss": 4.91976833, "grad_norm": 0.55449331, "learning_rate": 4.855e-05, "elapsed_time_per_iteration": 6.5993073, "memory(GiB)": 21.51, "elapsed_time": "6h 18m 49s", "remaining_time": "5h 18m 20s", "loss_scale": 1.0, "consumed_samples": 884992, "global_step/max_steps": "3457/6362"} +{"lm loss": 4.90493155, "grad_norm": 0.63386238, "learning_rate": 4.853e-05, "elapsed_time_per_iteration": 6.54595852, "memory(GiB)": 21.51, "elapsed_time": "6h 18m 56s", "remaining_time": "5h 18m 13s", "loss_scale": 1.0, "consumed_samples": 885248, "global_step/max_steps": "3458/6362"} +{"lm loss": 4.91047525, "grad_norm": 0.45567048, "learning_rate": 4.85e-05, "elapsed_time_per_iteration": 6.62563396, "memory(GiB)": 21.51, "elapsed_time": "6h 19m 2s", "remaining_time": "5h 18m 7s", "loss_scale": 1.0, "consumed_samples": 885504, "global_step/max_steps": "3459/6362"} +{"lm loss": 4.92457199, "grad_norm": 0.54001564, "learning_rate": 4.848e-05, "elapsed_time_per_iteration": 6.58753586, "memory(GiB)": 21.51, "elapsed_time": "6h 19m 9s", "remaining_time": "5h 18m 0s", "loss_scale": 1.0, "consumed_samples": 885760, "global_step/max_steps": "3460/6362"} +{"lm loss": 4.92011976, "grad_norm": 0.52359545, "learning_rate": 4.845e-05, "elapsed_time_per_iteration": 6.39227581, "memory(GiB)": 21.51, "elapsed_time": "6h 19m 15s", "remaining_time": "5h 17m 53s", "loss_scale": 1.0, "consumed_samples": 886016, "global_step/max_steps": "3461/6362"} +{"lm loss": 4.90657282, "grad_norm": 0.60123408, "learning_rate": 4.843e-05, "elapsed_time_per_iteration": 6.47943568, "memory(GiB)": 21.51, "elapsed_time": "6h 19m 22s", "remaining_time": "5h 17m 47s", "loss_scale": 1.0, "consumed_samples": 886272, "global_step/max_steps": "3462/6362"} +{"lm loss": 4.9271698, "grad_norm": 0.49764711, "learning_rate": 4.84e-05, "elapsed_time_per_iteration": 6.48446989, "memory(GiB)": 21.51, "elapsed_time": "6h 19m 28s", "remaining_time": "5h 17m 40s", "loss_scale": 1.0, "consumed_samples": 886528, "global_step/max_steps": "3463/6362"} +{"lm loss": 4.93004179, "grad_norm": 0.49611506, "learning_rate": 4.838e-05, "elapsed_time_per_iteration": 6.67033291, "memory(GiB)": 21.51, "elapsed_time": "6h 19m 35s", "remaining_time": "5h 17m 33s", "loss_scale": 1.0, "consumed_samples": 886784, "global_step/max_steps": "3464/6362"} +{"lm loss": 4.9246974, "grad_norm": 0.55455869, "learning_rate": 4.835e-05, "elapsed_time_per_iteration": 6.59608769, "memory(GiB)": 21.51, "elapsed_time": "6h 19m 41s", "remaining_time": "5h 17m 27s", "loss_scale": 1.0, "consumed_samples": 887040, "global_step/max_steps": "3465/6362"} +{"lm loss": 4.93210983, "grad_norm": 0.46324018, "learning_rate": 4.833e-05, "elapsed_time_per_iteration": 6.35495806, "memory(GiB)": 21.51, "elapsed_time": "6h 19m 48s", "remaining_time": "5h 17m 20s", "loss_scale": 1.0, "consumed_samples": 887296, "global_step/max_steps": "3466/6362"} +{"lm loss": 4.93425465, "grad_norm": 0.59034216, "learning_rate": 4.83e-05, "elapsed_time_per_iteration": 6.52023816, "memory(GiB)": 21.51, "elapsed_time": "6h 19m 54s", "remaining_time": "5h 17m 14s", "loss_scale": 1.0, "consumed_samples": 887552, "global_step/max_steps": "3467/6362"} +{"lm loss": 4.90522289, "grad_norm": 0.5804832, "learning_rate": 4.828e-05, "elapsed_time_per_iteration": 6.43008137, "memory(GiB)": 21.51, "elapsed_time": "6h 20m 1s", "remaining_time": "5h 17m 7s", "loss_scale": 1.0, "consumed_samples": 887808, "global_step/max_steps": "3468/6362"} +{"lm loss": 4.93281174, "grad_norm": 0.56317836, "learning_rate": 4.825e-05, "elapsed_time_per_iteration": 6.48930812, "memory(GiB)": 21.51, "elapsed_time": "6h 20m 7s", "remaining_time": "5h 17m 0s", "loss_scale": 1.0, "consumed_samples": 888064, "global_step/max_steps": "3469/6362"} +{"lm loss": 4.91772699, "grad_norm": 0.49175924, "learning_rate": 4.823e-05, "elapsed_time_per_iteration": 6.42849374, "memory(GiB)": 21.51, "elapsed_time": "6h 20m 14s", "remaining_time": "5h 16m 53s", "loss_scale": 1.0, "consumed_samples": 888320, "global_step/max_steps": "3470/6362"} +{"lm loss": 4.92506933, "grad_norm": 0.45063615, "learning_rate": 4.82e-05, "elapsed_time_per_iteration": 6.53282285, "memory(GiB)": 21.51, "elapsed_time": "6h 20m 20s", "remaining_time": "5h 16m 47s", "loss_scale": 1.0, "consumed_samples": 888576, "global_step/max_steps": "3471/6362"} +{"lm loss": 4.91575098, "grad_norm": 0.47581673, "learning_rate": 4.818e-05, "elapsed_time_per_iteration": 6.5211072, "memory(GiB)": 21.51, "elapsed_time": "6h 20m 27s", "remaining_time": "5h 16m 40s", "loss_scale": 1.0, "consumed_samples": 888832, "global_step/max_steps": "3472/6362"} +{"lm loss": 4.92563105, "grad_norm": 0.45022985, "learning_rate": 4.815e-05, "elapsed_time_per_iteration": 6.44099545, "memory(GiB)": 21.51, "elapsed_time": "6h 20m 33s", "remaining_time": "5h 16m 34s", "loss_scale": 1.0, "consumed_samples": 889088, "global_step/max_steps": "3473/6362"} +{"lm loss": 4.89588594, "grad_norm": 0.4340342, "learning_rate": 4.813e-05, "elapsed_time_per_iteration": 6.5414803, "memory(GiB)": 21.51, "elapsed_time": "6h 20m 40s", "remaining_time": "5h 16m 27s", "loss_scale": 1.0, "consumed_samples": 889344, "global_step/max_steps": "3474/6362"} +{"lm loss": 4.92109346, "grad_norm": 0.43681419, "learning_rate": 4.81e-05, "elapsed_time_per_iteration": 6.62955451, "memory(GiB)": 21.51, "elapsed_time": "6h 20m 46s", "remaining_time": "5h 16m 20s", "loss_scale": 1.0, "consumed_samples": 889600, "global_step/max_steps": "3475/6362"} +{"lm loss": 4.91022015, "grad_norm": 0.48194733, "learning_rate": 4.808e-05, "elapsed_time_per_iteration": 6.49412012, "memory(GiB)": 21.51, "elapsed_time": "6h 20m 53s", "remaining_time": "5h 16m 14s", "loss_scale": 1.0, "consumed_samples": 889856, "global_step/max_steps": "3476/6362"} +{"lm loss": 4.90928793, "grad_norm": 0.51009899, "learning_rate": 4.805e-05, "elapsed_time_per_iteration": 6.5778172, "memory(GiB)": 21.51, "elapsed_time": "6h 20m 59s", "remaining_time": "5h 16m 7s", "loss_scale": 1.0, "consumed_samples": 890112, "global_step/max_steps": "3477/6362"} +{"lm loss": 4.92230034, "grad_norm": 0.46668592, "learning_rate": 4.803e-05, "elapsed_time_per_iteration": 6.51857781, "memory(GiB)": 21.51, "elapsed_time": "6h 21m 6s", "remaining_time": "5h 16m 1s", "loss_scale": 1.0, "consumed_samples": 890368, "global_step/max_steps": "3478/6362"} +{"lm loss": 4.93292618, "grad_norm": 0.46048844, "learning_rate": 4.8e-05, "elapsed_time_per_iteration": 6.70456386, "memory(GiB)": 21.51, "elapsed_time": "6h 21m 13s", "remaining_time": "5h 15m 54s", "loss_scale": 1.0, "consumed_samples": 890624, "global_step/max_steps": "3479/6362"} +{"lm loss": 4.90313339, "grad_norm": 0.42558771, "learning_rate": 4.797e-05, "elapsed_time_per_iteration": 6.49725699, "memory(GiB)": 21.51, "elapsed_time": "6h 21m 19s", "remaining_time": "5h 15m 47s", "loss_scale": 1.0, "consumed_samples": 890880, "global_step/max_steps": "3480/6362"} +{"lm loss": 4.9171052, "grad_norm": 0.48457897, "learning_rate": 4.795e-05, "elapsed_time_per_iteration": 6.54407811, "memory(GiB)": 21.51, "elapsed_time": "6h 21m 26s", "remaining_time": "5h 15m 41s", "loss_scale": 1.0, "consumed_samples": 891136, "global_step/max_steps": "3481/6362"} +{"lm loss": 4.92628574, "grad_norm": 0.39507487, "learning_rate": 4.792e-05, "elapsed_time_per_iteration": 6.45740533, "memory(GiB)": 21.51, "elapsed_time": "6h 21m 32s", "remaining_time": "5h 15m 34s", "loss_scale": 1.0, "consumed_samples": 891392, "global_step/max_steps": "3482/6362"} +{"lm loss": 4.91025972, "grad_norm": 0.46954691, "learning_rate": 4.79e-05, "elapsed_time_per_iteration": 6.55618286, "memory(GiB)": 21.51, "elapsed_time": "6h 21m 39s", "remaining_time": "5h 15m 28s", "loss_scale": 1.0, "consumed_samples": 891648, "global_step/max_steps": "3483/6362"} +{"lm loss": 4.8812294, "grad_norm": 0.46674562, "learning_rate": 4.787e-05, "elapsed_time_per_iteration": 6.67098689, "memory(GiB)": 21.51, "elapsed_time": "6h 21m 45s", "remaining_time": "5h 15m 21s", "loss_scale": 1.0, "consumed_samples": 891904, "global_step/max_steps": "3484/6362"} +{"lm loss": 4.93885517, "grad_norm": 0.51265293, "learning_rate": 4.785e-05, "elapsed_time_per_iteration": 6.49535632, "memory(GiB)": 21.51, "elapsed_time": "6h 21m 52s", "remaining_time": "5h 15m 14s", "loss_scale": 1.0, "consumed_samples": 892160, "global_step/max_steps": "3485/6362"} +{"lm loss": 4.90616512, "grad_norm": 0.49626106, "learning_rate": 4.782e-05, "elapsed_time_per_iteration": 6.41455555, "memory(GiB)": 21.51, "elapsed_time": "6h 21m 58s", "remaining_time": "5h 15m 8s", "loss_scale": 1.0, "consumed_samples": 892416, "global_step/max_steps": "3486/6362"} +{"lm loss": 4.92930365, "grad_norm": 0.50199586, "learning_rate": 4.78e-05, "elapsed_time_per_iteration": 6.59219599, "memory(GiB)": 21.51, "elapsed_time": "6h 22m 5s", "remaining_time": "5h 15m 1s", "loss_scale": 1.0, "consumed_samples": 892672, "global_step/max_steps": "3487/6362"} +{"lm loss": 4.92233801, "grad_norm": 0.42054772, "learning_rate": 4.777e-05, "elapsed_time_per_iteration": 6.94636965, "memory(GiB)": 21.51, "elapsed_time": "6h 22m 12s", "remaining_time": "5h 14m 55s", "loss_scale": 1.0, "consumed_samples": 892928, "global_step/max_steps": "3488/6362"} +{"lm loss": 4.92071247, "grad_norm": 0.4713397, "learning_rate": 4.775e-05, "elapsed_time_per_iteration": 6.74428391, "memory(GiB)": 21.51, "elapsed_time": "6h 22m 19s", "remaining_time": "5h 14m 49s", "loss_scale": 1.0, "consumed_samples": 893184, "global_step/max_steps": "3489/6362"} +{"lm loss": 4.92720938, "grad_norm": 0.53710967, "learning_rate": 4.772e-05, "elapsed_time_per_iteration": 6.65323734, "memory(GiB)": 21.51, "elapsed_time": "6h 22m 25s", "remaining_time": "5h 14m 42s", "loss_scale": 1.0, "consumed_samples": 893440, "global_step/max_steps": "3490/6362"} +{"lm loss": 4.91157913, "grad_norm": 0.47720456, "learning_rate": 4.77e-05, "elapsed_time_per_iteration": 6.49822569, "memory(GiB)": 21.51, "elapsed_time": "6h 22m 32s", "remaining_time": "5h 14m 35s", "loss_scale": 1.0, "consumed_samples": 893696, "global_step/max_steps": "3491/6362"} +{"lm loss": 4.90827131, "grad_norm": 0.51871854, "learning_rate": 4.767e-05, "elapsed_time_per_iteration": 6.36962509, "memory(GiB)": 21.51, "elapsed_time": "6h 22m 38s", "remaining_time": "5h 14m 29s", "loss_scale": 1.0, "consumed_samples": 893952, "global_step/max_steps": "3492/6362"} +{"lm loss": 4.93629885, "grad_norm": 0.47772712, "learning_rate": 4.765e-05, "elapsed_time_per_iteration": 6.39209104, "memory(GiB)": 21.51, "elapsed_time": "6h 22m 44s", "remaining_time": "5h 14m 22s", "loss_scale": 1.0, "consumed_samples": 894208, "global_step/max_steps": "3493/6362"} +{"lm loss": 4.91680956, "grad_norm": 0.48056799, "learning_rate": 4.762e-05, "elapsed_time_per_iteration": 6.5209744, "memory(GiB)": 21.51, "elapsed_time": "6h 22m 51s", "remaining_time": "5h 14m 15s", "loss_scale": 1.0, "consumed_samples": 894464, "global_step/max_steps": "3494/6362"} +{"lm loss": 4.87917948, "grad_norm": 0.49440137, "learning_rate": 4.76e-05, "elapsed_time_per_iteration": 6.58493137, "memory(GiB)": 21.51, "elapsed_time": "6h 22m 58s", "remaining_time": "5h 14m 9s", "loss_scale": 1.0, "consumed_samples": 894720, "global_step/max_steps": "3495/6362"} +{"lm loss": 4.93383455, "grad_norm": 0.47026989, "learning_rate": 4.757e-05, "elapsed_time_per_iteration": 6.55144477, "memory(GiB)": 21.51, "elapsed_time": "6h 23m 4s", "remaining_time": "5h 14m 2s", "loss_scale": 1.0, "consumed_samples": 894976, "global_step/max_steps": "3496/6362"} +{"lm loss": 4.91673708, "grad_norm": 0.47795108, "learning_rate": 4.755e-05, "elapsed_time_per_iteration": 6.72638941, "memory(GiB)": 21.51, "elapsed_time": "6h 23m 11s", "remaining_time": "5h 13m 56s", "loss_scale": 1.0, "consumed_samples": 895232, "global_step/max_steps": "3497/6362"} +{"lm loss": 4.9350071, "grad_norm": 0.45832276, "learning_rate": 4.752e-05, "elapsed_time_per_iteration": 6.57819915, "memory(GiB)": 21.51, "elapsed_time": "6h 23m 17s", "remaining_time": "5h 13m 49s", "loss_scale": 1.0, "consumed_samples": 895488, "global_step/max_steps": "3498/6362"} +{"lm loss": 4.90814686, "grad_norm": 0.46676236, "learning_rate": 4.75e-05, "elapsed_time_per_iteration": 6.46744299, "memory(GiB)": 21.51, "elapsed_time": "6h 23m 24s", "remaining_time": "5h 13m 42s", "loss_scale": 1.0, "consumed_samples": 895744, "global_step/max_steps": "3499/6362"} +{"lm loss": 4.92481136, "grad_norm": 0.45786113, "learning_rate": 4.747e-05, "elapsed_time_per_iteration": 6.68754458, "memory(GiB)": 21.51, "elapsed_time": "6h 23m 31s", "remaining_time": "5h 13m 36s", "loss_scale": 1.0, "consumed_samples": 896000, "global_step/max_steps": "3500/6362"} +{"lm loss": 4.9073348, "grad_norm": 0.4282752, "learning_rate": 4.745e-05, "elapsed_time_per_iteration": 6.50413465, "memory(GiB)": 21.51, "elapsed_time": "6h 23m 37s", "remaining_time": "5h 13m 29s", "loss_scale": 1.0, "consumed_samples": 896256, "global_step/max_steps": "3501/6362"} +{"lm loss": 4.92270374, "grad_norm": 0.47206184, "learning_rate": 4.742e-05, "elapsed_time_per_iteration": 6.48430276, "memory(GiB)": 21.51, "elapsed_time": "6h 23m 44s", "remaining_time": "5h 13m 23s", "loss_scale": 1.0, "consumed_samples": 896512, "global_step/max_steps": "3502/6362"} +{"lm loss": 4.9387455, "grad_norm": 0.53047156, "learning_rate": 4.74e-05, "elapsed_time_per_iteration": 6.44924569, "memory(GiB)": 21.51, "elapsed_time": "6h 23m 50s", "remaining_time": "5h 13m 16s", "loss_scale": 1.0, "consumed_samples": 896768, "global_step/max_steps": "3503/6362"} +{"lm loss": 4.92823744, "grad_norm": 0.61698771, "learning_rate": 4.737e-05, "elapsed_time_per_iteration": 6.39422846, "memory(GiB)": 21.51, "elapsed_time": "6h 23m 56s", "remaining_time": "5h 13m 9s", "loss_scale": 1.0, "consumed_samples": 897024, "global_step/max_steps": "3504/6362"} +{"lm loss": 4.92384243, "grad_norm": 0.49929687, "learning_rate": 4.735e-05, "elapsed_time_per_iteration": 6.82115316, "memory(GiB)": 21.51, "elapsed_time": "6h 24m 3s", "remaining_time": "5h 13m 3s", "loss_scale": 1.0, "consumed_samples": 897280, "global_step/max_steps": "3505/6362"} +{"lm loss": 4.92333412, "grad_norm": 0.51206243, "learning_rate": 4.732e-05, "elapsed_time_per_iteration": 6.59782314, "memory(GiB)": 21.51, "elapsed_time": "6h 24m 10s", "remaining_time": "5h 12m 56s", "loss_scale": 1.0, "consumed_samples": 897536, "global_step/max_steps": "3506/6362"} +{"lm loss": 4.92236376, "grad_norm": 0.46908465, "learning_rate": 4.73e-05, "elapsed_time_per_iteration": 6.5543437, "memory(GiB)": 21.51, "elapsed_time": "6h 24m 16s", "remaining_time": "5h 12m 50s", "loss_scale": 1.0, "consumed_samples": 897792, "global_step/max_steps": "3507/6362"} +{"lm loss": 4.93354702, "grad_norm": 0.50476295, "learning_rate": 4.727e-05, "elapsed_time_per_iteration": 6.50591588, "memory(GiB)": 21.51, "elapsed_time": "6h 24m 23s", "remaining_time": "5h 12m 43s", "loss_scale": 1.0, "consumed_samples": 898048, "global_step/max_steps": "3508/6362"} +{"lm loss": 4.90799952, "grad_norm": 0.46084094, "learning_rate": 4.725e-05, "elapsed_time_per_iteration": 6.71303964, "memory(GiB)": 21.51, "elapsed_time": "6h 24m 30s", "remaining_time": "5h 12m 37s", "loss_scale": 1.0, "consumed_samples": 898304, "global_step/max_steps": "3509/6362"} +{"lm loss": 4.90598583, "grad_norm": 0.51877266, "learning_rate": 4.722e-05, "elapsed_time_per_iteration": 6.71415186, "memory(GiB)": 21.51, "elapsed_time": "6h 24m 36s", "remaining_time": "5h 12m 30s", "loss_scale": 1.0, "consumed_samples": 898560, "global_step/max_steps": "3510/6362"} +{"lm loss": 4.91427755, "grad_norm": 0.5598892, "learning_rate": 4.72e-05, "elapsed_time_per_iteration": 6.53915524, "memory(GiB)": 21.51, "elapsed_time": "6h 24m 43s", "remaining_time": "5h 12m 24s", "loss_scale": 1.0, "consumed_samples": 898816, "global_step/max_steps": "3511/6362"} +{"lm loss": 4.91951942, "grad_norm": 0.5465048, "learning_rate": 4.717e-05, "elapsed_time_per_iteration": 6.46299744, "memory(GiB)": 21.51, "elapsed_time": "6h 24m 49s", "remaining_time": "5h 12m 17s", "loss_scale": 1.0, "consumed_samples": 899072, "global_step/max_steps": "3512/6362"} +{"lm loss": 4.87953949, "grad_norm": 0.45362231, "learning_rate": 4.715e-05, "elapsed_time_per_iteration": 6.71800208, "memory(GiB)": 21.51, "elapsed_time": "6h 24m 56s", "remaining_time": "5h 12m 10s", "loss_scale": 1.0, "consumed_samples": 899328, "global_step/max_steps": "3513/6362"} +{"lm loss": 4.92996025, "grad_norm": 0.49421048, "learning_rate": 4.712e-05, "elapsed_time_per_iteration": 6.4195416, "memory(GiB)": 21.51, "elapsed_time": "6h 25m 2s", "remaining_time": "5h 12m 4s", "loss_scale": 1.0, "consumed_samples": 899584, "global_step/max_steps": "3514/6362"} +{"lm loss": 4.91894341, "grad_norm": 0.58247119, "learning_rate": 4.71e-05, "elapsed_time_per_iteration": 6.71327448, "memory(GiB)": 21.51, "elapsed_time": "6h 25m 9s", "remaining_time": "5h 11m 57s", "loss_scale": 1.0, "consumed_samples": 899840, "global_step/max_steps": "3515/6362"} +{"lm loss": 4.91738892, "grad_norm": 0.49775195, "learning_rate": 4.707e-05, "elapsed_time_per_iteration": 6.33946562, "memory(GiB)": 21.51, "elapsed_time": "6h 25m 15s", "remaining_time": "5h 11m 51s", "loss_scale": 1.0, "consumed_samples": 900096, "global_step/max_steps": "3516/6362"} +{"lm loss": 4.92551947, "grad_norm": 0.447743, "learning_rate": 4.705e-05, "elapsed_time_per_iteration": 6.5065794, "memory(GiB)": 21.51, "elapsed_time": "6h 25m 22s", "remaining_time": "5h 11m 44s", "loss_scale": 1.0, "consumed_samples": 900352, "global_step/max_steps": "3517/6362"} +{"lm loss": 4.92244434, "grad_norm": 0.53523457, "learning_rate": 4.702e-05, "elapsed_time_per_iteration": 6.49037838, "memory(GiB)": 21.51, "elapsed_time": "6h 25m 28s", "remaining_time": "5h 11m 37s", "loss_scale": 1.0, "consumed_samples": 900608, "global_step/max_steps": "3518/6362"} +{"lm loss": 4.91298008, "grad_norm": 0.55048591, "learning_rate": 4.7e-05, "elapsed_time_per_iteration": 6.71891475, "memory(GiB)": 21.51, "elapsed_time": "6h 25m 35s", "remaining_time": "5h 11m 31s", "loss_scale": 1.0, "consumed_samples": 900864, "global_step/max_steps": "3519/6362"} +{"lm loss": 4.90733099, "grad_norm": 0.50567126, "learning_rate": 4.697e-05, "elapsed_time_per_iteration": 6.55766368, "memory(GiB)": 21.51, "elapsed_time": "6h 25m 42s", "remaining_time": "5h 11m 24s", "loss_scale": 1.0, "consumed_samples": 901120, "global_step/max_steps": "3520/6362"} +{"lm loss": 4.91196775, "grad_norm": 0.46945077, "learning_rate": 4.694e-05, "elapsed_time_per_iteration": 6.50499082, "memory(GiB)": 21.51, "elapsed_time": "6h 25m 48s", "remaining_time": "5h 11m 18s", "loss_scale": 1.0, "consumed_samples": 901376, "global_step/max_steps": "3521/6362"} +{"lm loss": 4.93345928, "grad_norm": 0.45133376, "learning_rate": 4.692e-05, "elapsed_time_per_iteration": 6.32919407, "memory(GiB)": 21.51, "elapsed_time": "6h 25m 55s", "remaining_time": "5h 11m 11s", "loss_scale": 1.0, "consumed_samples": 901632, "global_step/max_steps": "3522/6362"} +{"lm loss": 4.90899706, "grad_norm": 0.53420043, "learning_rate": 4.689e-05, "elapsed_time_per_iteration": 6.60105944, "memory(GiB)": 21.51, "elapsed_time": "6h 26m 1s", "remaining_time": "5h 11m 4s", "loss_scale": 1.0, "consumed_samples": 901888, "global_step/max_steps": "3523/6362"} +{"lm loss": 4.9086504, "grad_norm": 0.47302866, "learning_rate": 4.687e-05, "elapsed_time_per_iteration": 6.42291927, "memory(GiB)": 21.51, "elapsed_time": "6h 26m 8s", "remaining_time": "5h 10m 58s", "loss_scale": 1.0, "consumed_samples": 902144, "global_step/max_steps": "3524/6362"} +{"lm loss": 4.91370487, "grad_norm": 0.46343252, "learning_rate": 4.684e-05, "elapsed_time_per_iteration": 6.66042161, "memory(GiB)": 21.51, "elapsed_time": "6h 26m 14s", "remaining_time": "5h 10m 51s", "loss_scale": 1.0, "consumed_samples": 902400, "global_step/max_steps": "3525/6362"} +{"lm loss": 4.93611813, "grad_norm": 0.42759123, "learning_rate": 4.682e-05, "elapsed_time_per_iteration": 6.59798121, "memory(GiB)": 21.51, "elapsed_time": "6h 26m 21s", "remaining_time": "5h 10m 45s", "loss_scale": 1.0, "consumed_samples": 902656, "global_step/max_steps": "3526/6362"} +{"lm loss": 4.90633011, "grad_norm": 0.42461887, "learning_rate": 4.679e-05, "elapsed_time_per_iteration": 6.69499159, "memory(GiB)": 21.51, "elapsed_time": "6h 26m 28s", "remaining_time": "5h 10m 38s", "loss_scale": 1.0, "consumed_samples": 902912, "global_step/max_steps": "3527/6362"} +{"lm loss": 4.91317415, "grad_norm": 0.40202695, "learning_rate": 4.677e-05, "elapsed_time_per_iteration": 6.48809052, "memory(GiB)": 21.51, "elapsed_time": "6h 26m 34s", "remaining_time": "5h 10m 31s", "loss_scale": 1.0, "consumed_samples": 903168, "global_step/max_steps": "3528/6362"} +{"lm loss": 4.92107105, "grad_norm": 0.44924533, "learning_rate": 4.674e-05, "elapsed_time_per_iteration": 6.59610152, "memory(GiB)": 21.51, "elapsed_time": "6h 26m 41s", "remaining_time": "5h 10m 25s", "loss_scale": 1.0, "consumed_samples": 903424, "global_step/max_steps": "3529/6362"} +{"lm loss": 4.92404366, "grad_norm": 0.45210287, "learning_rate": 4.672e-05, "elapsed_time_per_iteration": 6.59536147, "memory(GiB)": 21.51, "elapsed_time": "6h 26m 47s", "remaining_time": "5h 10m 18s", "loss_scale": 1.0, "consumed_samples": 903680, "global_step/max_steps": "3530/6362"} +{"lm loss": 4.90527534, "grad_norm": 0.41994417, "learning_rate": 4.669e-05, "elapsed_time_per_iteration": 6.52585578, "memory(GiB)": 21.51, "elapsed_time": "6h 26m 54s", "remaining_time": "5h 10m 12s", "loss_scale": 1.0, "consumed_samples": 903936, "global_step/max_steps": "3531/6362"} +{"lm loss": 4.92512035, "grad_norm": 0.47121552, "learning_rate": 4.667e-05, "elapsed_time_per_iteration": 6.47585964, "memory(GiB)": 21.51, "elapsed_time": "6h 27m 0s", "remaining_time": "5h 10m 5s", "loss_scale": 1.0, "consumed_samples": 904192, "global_step/max_steps": "3532/6362"} +{"lm loss": 4.93310642, "grad_norm": 0.43919426, "learning_rate": 4.664e-05, "elapsed_time_per_iteration": 6.57319617, "memory(GiB)": 21.51, "elapsed_time": "6h 27m 7s", "remaining_time": "5h 9m 58s", "loss_scale": 1.0, "consumed_samples": 904448, "global_step/max_steps": "3533/6362"} +{"lm loss": 4.92784309, "grad_norm": 0.44822472, "learning_rate": 4.662e-05, "elapsed_time_per_iteration": 6.54697847, "memory(GiB)": 21.51, "elapsed_time": "6h 27m 13s", "remaining_time": "5h 9m 52s", "loss_scale": 1.0, "consumed_samples": 904704, "global_step/max_steps": "3534/6362"} +{"lm loss": 4.93767548, "grad_norm": 0.45898849, "learning_rate": 4.659e-05, "elapsed_time_per_iteration": 6.48629236, "memory(GiB)": 21.51, "elapsed_time": "6h 27m 20s", "remaining_time": "5h 9m 45s", "loss_scale": 1.0, "consumed_samples": 904960, "global_step/max_steps": "3535/6362"} +{"lm loss": 4.94100285, "grad_norm": 0.4453246, "learning_rate": 4.657e-05, "elapsed_time_per_iteration": 6.48201442, "memory(GiB)": 21.51, "elapsed_time": "6h 27m 26s", "remaining_time": "5h 9m 39s", "loss_scale": 1.0, "consumed_samples": 905216, "global_step/max_steps": "3536/6362"} +{"lm loss": 4.92026043, "grad_norm": 0.4303076, "learning_rate": 4.654e-05, "elapsed_time_per_iteration": 6.52847052, "memory(GiB)": 21.51, "elapsed_time": "6h 27m 33s", "remaining_time": "5h 9m 32s", "loss_scale": 1.0, "consumed_samples": 905472, "global_step/max_steps": "3537/6362"} +{"lm loss": 4.92121124, "grad_norm": 0.48106948, "learning_rate": 4.652e-05, "elapsed_time_per_iteration": 6.65880823, "memory(GiB)": 21.51, "elapsed_time": "6h 27m 40s", "remaining_time": "5h 9m 25s", "loss_scale": 1.0, "consumed_samples": 905728, "global_step/max_steps": "3538/6362"} +{"lm loss": 4.93385839, "grad_norm": 0.46926612, "learning_rate": 4.649e-05, "elapsed_time_per_iteration": 6.55981135, "memory(GiB)": 21.51, "elapsed_time": "6h 27m 46s", "remaining_time": "5h 9m 19s", "loss_scale": 1.0, "consumed_samples": 905984, "global_step/max_steps": "3539/6362"} +{"lm loss": 4.91150951, "grad_norm": 0.56111175, "learning_rate": 4.647e-05, "elapsed_time_per_iteration": 6.66037107, "memory(GiB)": 21.51, "elapsed_time": "6h 27m 53s", "remaining_time": "5h 9m 12s", "loss_scale": 1.0, "consumed_samples": 906240, "global_step/max_steps": "3540/6362"} +{"lm loss": 4.93637133, "grad_norm": 0.4730638, "learning_rate": 4.644e-05, "elapsed_time_per_iteration": 6.59312725, "memory(GiB)": 21.51, "elapsed_time": "6h 27m 59s", "remaining_time": "5h 9m 6s", "loss_scale": 1.0, "consumed_samples": 906496, "global_step/max_steps": "3541/6362"} +{"lm loss": 4.94349289, "grad_norm": 0.53039229, "learning_rate": 4.642e-05, "elapsed_time_per_iteration": 6.50557995, "memory(GiB)": 21.51, "elapsed_time": "6h 28m 6s", "remaining_time": "5h 8m 59s", "loss_scale": 1.0, "consumed_samples": 906752, "global_step/max_steps": "3542/6362"} +{"lm loss": 4.922894, "grad_norm": 0.50070548, "learning_rate": 4.639e-05, "elapsed_time_per_iteration": 6.72651076, "memory(GiB)": 21.51, "elapsed_time": "6h 28m 13s", "remaining_time": "5h 8m 53s", "loss_scale": 1.0, "consumed_samples": 907008, "global_step/max_steps": "3543/6362"} +{"lm loss": 4.92572021, "grad_norm": 0.48756894, "learning_rate": 4.637e-05, "elapsed_time_per_iteration": 6.79788661, "memory(GiB)": 21.51, "elapsed_time": "6h 28m 19s", "remaining_time": "5h 8m 46s", "loss_scale": 1.0, "consumed_samples": 907264, "global_step/max_steps": "3544/6362"} +{"lm loss": 4.93785238, "grad_norm": 0.49189389, "learning_rate": 4.634e-05, "elapsed_time_per_iteration": 6.65411067, "memory(GiB)": 21.51, "elapsed_time": "6h 28m 26s", "remaining_time": "5h 8m 40s", "loss_scale": 1.0, "consumed_samples": 907520, "global_step/max_steps": "3545/6362"} +{"lm loss": 4.92424202, "grad_norm": 0.48905751, "learning_rate": 4.632e-05, "elapsed_time_per_iteration": 6.86169243, "memory(GiB)": 21.51, "elapsed_time": "6h 28m 33s", "remaining_time": "5h 8m 33s", "loss_scale": 1.0, "consumed_samples": 907776, "global_step/max_steps": "3546/6362"} +{"lm loss": 4.89358425, "grad_norm": 0.5788576, "learning_rate": 4.629e-05, "elapsed_time_per_iteration": 6.89196563, "memory(GiB)": 21.51, "elapsed_time": "6h 28m 40s", "remaining_time": "5h 8m 27s", "loss_scale": 1.0, "consumed_samples": 908032, "global_step/max_steps": "3547/6362"} +{"lm loss": 4.9098525, "grad_norm": 0.4985624, "learning_rate": 4.627e-05, "elapsed_time_per_iteration": 6.64601088, "memory(GiB)": 21.51, "elapsed_time": "6h 28m 46s", "remaining_time": "5h 8m 21s", "loss_scale": 1.0, "consumed_samples": 908288, "global_step/max_steps": "3548/6362"} +{"lm loss": 4.92958355, "grad_norm": 0.56064934, "learning_rate": 4.624e-05, "elapsed_time_per_iteration": 6.80040026, "memory(GiB)": 21.51, "elapsed_time": "6h 28m 53s", "remaining_time": "5h 8m 14s", "loss_scale": 1.0, "consumed_samples": 908544, "global_step/max_steps": "3549/6362"} +{"lm loss": 4.91879177, "grad_norm": 0.45721552, "learning_rate": 4.622e-05, "elapsed_time_per_iteration": 6.72753835, "memory(GiB)": 21.51, "elapsed_time": "6h 29m 0s", "remaining_time": "5h 8m 8s", "loss_scale": 1.0, "consumed_samples": 908800, "global_step/max_steps": "3550/6362"} +{"lm loss": 4.94390774, "grad_norm": 0.45982412, "learning_rate": 4.619e-05, "elapsed_time_per_iteration": 6.39636064, "memory(GiB)": 21.51, "elapsed_time": "6h 29m 6s", "remaining_time": "5h 8m 1s", "loss_scale": 1.0, "consumed_samples": 909056, "global_step/max_steps": "3551/6362"} +{"lm loss": 4.92352533, "grad_norm": 0.48623368, "learning_rate": 4.617e-05, "elapsed_time_per_iteration": 6.59774685, "memory(GiB)": 21.51, "elapsed_time": "6h 29m 13s", "remaining_time": "5h 7m 54s", "loss_scale": 1.0, "consumed_samples": 909312, "global_step/max_steps": "3552/6362"} +{"lm loss": 4.92141008, "grad_norm": 0.58433688, "learning_rate": 4.614e-05, "elapsed_time_per_iteration": 6.50419784, "memory(GiB)": 21.51, "elapsed_time": "6h 29m 19s", "remaining_time": "5h 7m 48s", "loss_scale": 1.0, "consumed_samples": 909568, "global_step/max_steps": "3553/6362"} +{"lm loss": 4.91162014, "grad_norm": 0.4960565, "learning_rate": 4.612e-05, "elapsed_time_per_iteration": 6.6426003, "memory(GiB)": 21.51, "elapsed_time": "6h 29m 26s", "remaining_time": "5h 7m 41s", "loss_scale": 1.0, "consumed_samples": 909824, "global_step/max_steps": "3554/6362"} +{"lm loss": 4.90098, "grad_norm": 0.44159225, "learning_rate": 4.609e-05, "elapsed_time_per_iteration": 6.71222305, "memory(GiB)": 21.51, "elapsed_time": "6h 29m 33s", "remaining_time": "5h 7m 35s", "loss_scale": 1.0, "consumed_samples": 910080, "global_step/max_steps": "3555/6362"} +{"lm loss": 4.89894056, "grad_norm": 0.43607441, "learning_rate": 4.607e-05, "elapsed_time_per_iteration": 6.59286666, "memory(GiB)": 21.51, "elapsed_time": "6h 29m 39s", "remaining_time": "5h 7m 28s", "loss_scale": 1.0, "consumed_samples": 910336, "global_step/max_steps": "3556/6362"} +{"lm loss": 4.89502907, "grad_norm": 0.49864608, "learning_rate": 4.604e-05, "elapsed_time_per_iteration": 6.41573763, "memory(GiB)": 21.51, "elapsed_time": "6h 29m 46s", "remaining_time": "5h 7m 22s", "loss_scale": 1.0, "consumed_samples": 910592, "global_step/max_steps": "3557/6362"} +{"lm loss": 4.92322302, "grad_norm": 0.50999135, "learning_rate": 4.602e-05, "elapsed_time_per_iteration": 6.4691186, "memory(GiB)": 21.51, "elapsed_time": "6h 29m 52s", "remaining_time": "5h 7m 15s", "loss_scale": 1.0, "consumed_samples": 910848, "global_step/max_steps": "3558/6362"} +{"lm loss": 4.91308022, "grad_norm": 0.43929896, "learning_rate": 4.599e-05, "elapsed_time_per_iteration": 6.57178879, "memory(GiB)": 21.51, "elapsed_time": "6h 29m 59s", "remaining_time": "5h 7m 8s", "loss_scale": 1.0, "consumed_samples": 911104, "global_step/max_steps": "3559/6362"} +{"lm loss": 4.91568279, "grad_norm": 0.45957288, "learning_rate": 4.597e-05, "elapsed_time_per_iteration": 6.547194, "memory(GiB)": 21.51, "elapsed_time": "6h 30m 5s", "remaining_time": "5h 7m 2s", "loss_scale": 1.0, "consumed_samples": 911360, "global_step/max_steps": "3560/6362"} +{"lm loss": 4.90633774, "grad_norm": 0.47528365, "learning_rate": 4.594e-05, "elapsed_time_per_iteration": 6.52097297, "memory(GiB)": 21.51, "elapsed_time": "6h 30m 12s", "remaining_time": "5h 6m 55s", "loss_scale": 1.0, "consumed_samples": 911616, "global_step/max_steps": "3561/6362"} +{"lm loss": 4.92286015, "grad_norm": 0.49549389, "learning_rate": 4.592e-05, "elapsed_time_per_iteration": 6.48524022, "memory(GiB)": 21.51, "elapsed_time": "6h 30m 18s", "remaining_time": "5h 6m 49s", "loss_scale": 1.0, "consumed_samples": 911872, "global_step/max_steps": "3562/6362"} +{"lm loss": 4.8954258, "grad_norm": 0.43658078, "learning_rate": 4.589e-05, "elapsed_time_per_iteration": 6.43419003, "memory(GiB)": 21.51, "elapsed_time": "6h 30m 25s", "remaining_time": "5h 6m 42s", "loss_scale": 1.0, "consumed_samples": 912128, "global_step/max_steps": "3563/6362"} +{"lm loss": 4.92163277, "grad_norm": 0.46956781, "learning_rate": 4.587e-05, "elapsed_time_per_iteration": 6.64875698, "memory(GiB)": 21.51, "elapsed_time": "6h 30m 31s", "remaining_time": "5h 6m 35s", "loss_scale": 1.0, "consumed_samples": 912384, "global_step/max_steps": "3564/6362"} +{"lm loss": 4.91637754, "grad_norm": 0.50900376, "learning_rate": 4.584e-05, "elapsed_time_per_iteration": 6.35917044, "memory(GiB)": 21.51, "elapsed_time": "6h 30m 38s", "remaining_time": "5h 6m 29s", "loss_scale": 1.0, "consumed_samples": 912640, "global_step/max_steps": "3565/6362"} +{"lm loss": 4.92538786, "grad_norm": 0.52499014, "learning_rate": 4.582e-05, "elapsed_time_per_iteration": 6.50485659, "memory(GiB)": 21.51, "elapsed_time": "6h 30m 44s", "remaining_time": "5h 6m 22s", "loss_scale": 1.0, "consumed_samples": 912896, "global_step/max_steps": "3566/6362"} +{"lm loss": 4.92091084, "grad_norm": 0.55033112, "learning_rate": 4.579e-05, "elapsed_time_per_iteration": 6.89756918, "memory(GiB)": 21.51, "elapsed_time": "6h 30m 51s", "remaining_time": "5h 6m 16s", "loss_scale": 1.0, "consumed_samples": 913152, "global_step/max_steps": "3567/6362"} +{"lm loss": 4.92873573, "grad_norm": 0.44747564, "learning_rate": 4.577e-05, "elapsed_time_per_iteration": 6.45814562, "memory(GiB)": 21.51, "elapsed_time": "6h 30m 58s", "remaining_time": "5h 6m 9s", "loss_scale": 1.0, "consumed_samples": 913408, "global_step/max_steps": "3568/6362"} +{"lm loss": 4.89601803, "grad_norm": 0.43854553, "learning_rate": 4.574e-05, "elapsed_time_per_iteration": 6.70672345, "memory(GiB)": 21.51, "elapsed_time": "6h 31m 4s", "remaining_time": "5h 6m 2s", "loss_scale": 1.0, "consumed_samples": 913664, "global_step/max_steps": "3569/6362"} +{"lm loss": 4.92107344, "grad_norm": 0.47741842, "learning_rate": 4.572e-05, "elapsed_time_per_iteration": 6.40450573, "memory(GiB)": 21.51, "elapsed_time": "6h 31m 11s", "remaining_time": "5h 5m 56s", "loss_scale": 1.0, "consumed_samples": 913920, "global_step/max_steps": "3570/6362"} +{"lm loss": 4.92930603, "grad_norm": 0.43127644, "learning_rate": 4.569e-05, "elapsed_time_per_iteration": 6.33797693, "memory(GiB)": 21.51, "elapsed_time": "6h 31m 17s", "remaining_time": "5h 5m 49s", "loss_scale": 1.0, "consumed_samples": 914176, "global_step/max_steps": "3571/6362"} +{"lm loss": 4.91435194, "grad_norm": 0.44016743, "learning_rate": 4.567e-05, "elapsed_time_per_iteration": 6.71488667, "memory(GiB)": 21.51, "elapsed_time": "6h 31m 24s", "remaining_time": "5h 5m 43s", "loss_scale": 1.0, "consumed_samples": 914432, "global_step/max_steps": "3572/6362"} +{"lm loss": 4.91318274, "grad_norm": 0.43431771, "learning_rate": 4.564e-05, "elapsed_time_per_iteration": 6.47488832, "memory(GiB)": 21.51, "elapsed_time": "6h 31m 30s", "remaining_time": "5h 5m 36s", "loss_scale": 1.0, "consumed_samples": 914688, "global_step/max_steps": "3573/6362"} +{"lm loss": 4.91601324, "grad_norm": 0.41767398, "learning_rate": 4.562e-05, "elapsed_time_per_iteration": 6.52437449, "memory(GiB)": 21.51, "elapsed_time": "6h 31m 37s", "remaining_time": "5h 5m 29s", "loss_scale": 1.0, "consumed_samples": 914944, "global_step/max_steps": "3574/6362"} +{"lm loss": 4.93582678, "grad_norm": 0.42443594, "learning_rate": 4.559e-05, "elapsed_time_per_iteration": 6.37885618, "memory(GiB)": 21.51, "elapsed_time": "6h 31m 43s", "remaining_time": "5h 5m 23s", "loss_scale": 1.0, "consumed_samples": 915200, "global_step/max_steps": "3575/6362"} +{"lm loss": 4.91452026, "grad_norm": 0.44687456, "learning_rate": 4.557e-05, "elapsed_time_per_iteration": 6.40347314, "memory(GiB)": 21.51, "elapsed_time": "6h 31m 50s", "remaining_time": "5h 5m 16s", "loss_scale": 1.0, "consumed_samples": 915456, "global_step/max_steps": "3576/6362"} +{"lm loss": 4.90319157, "grad_norm": 0.42735431, "learning_rate": 4.554e-05, "elapsed_time_per_iteration": 6.47347426, "memory(GiB)": 21.51, "elapsed_time": "6h 31m 56s", "remaining_time": "5h 5m 9s", "loss_scale": 1.0, "consumed_samples": 915712, "global_step/max_steps": "3577/6362"} +{"lm loss": 4.91983318, "grad_norm": 0.46498016, "learning_rate": 4.552e-05, "elapsed_time_per_iteration": 6.45058942, "memory(GiB)": 21.51, "elapsed_time": "6h 32m 3s", "remaining_time": "5h 5m 3s", "loss_scale": 1.0, "consumed_samples": 915968, "global_step/max_steps": "3578/6362"} +{"lm loss": 4.91877031, "grad_norm": 0.49921533, "learning_rate": 4.549e-05, "elapsed_time_per_iteration": 6.4678278, "memory(GiB)": 21.51, "elapsed_time": "6h 32m 9s", "remaining_time": "5h 4m 56s", "loss_scale": 1.0, "consumed_samples": 916224, "global_step/max_steps": "3579/6362"} +{"lm loss": 4.94139719, "grad_norm": 0.46353638, "learning_rate": 4.547e-05, "elapsed_time_per_iteration": 6.6101706, "memory(GiB)": 21.51, "elapsed_time": "6h 32m 16s", "remaining_time": "5h 4m 49s", "loss_scale": 1.0, "consumed_samples": 916480, "global_step/max_steps": "3580/6362"} +{"lm loss": 4.91277409, "grad_norm": 0.55407572, "learning_rate": 4.544e-05, "elapsed_time_per_iteration": 6.38117957, "memory(GiB)": 21.51, "elapsed_time": "6h 32m 22s", "remaining_time": "5h 4m 43s", "loss_scale": 1.0, "consumed_samples": 916736, "global_step/max_steps": "3581/6362"} +{"lm loss": 4.91314125, "grad_norm": 0.49466977, "learning_rate": 4.542e-05, "elapsed_time_per_iteration": 6.44869614, "memory(GiB)": 21.51, "elapsed_time": "6h 32m 28s", "remaining_time": "5h 4m 36s", "loss_scale": 1.0, "consumed_samples": 916992, "global_step/max_steps": "3582/6362"} +{"lm loss": 4.9033637, "grad_norm": 0.52196264, "learning_rate": 4.539e-05, "elapsed_time_per_iteration": 6.3536191, "memory(GiB)": 21.51, "elapsed_time": "6h 32m 35s", "remaining_time": "5h 4m 29s", "loss_scale": 1.0, "consumed_samples": 917248, "global_step/max_steps": "3583/6362"} +{"lm loss": 4.91401339, "grad_norm": 0.53856766, "learning_rate": 4.537e-05, "elapsed_time_per_iteration": 6.68898892, "memory(GiB)": 21.51, "elapsed_time": "6h 32m 42s", "remaining_time": "5h 4m 23s", "loss_scale": 1.0, "consumed_samples": 917504, "global_step/max_steps": "3584/6362"} +{"lm loss": 4.89950657, "grad_norm": 0.42706877, "learning_rate": 4.534e-05, "elapsed_time_per_iteration": 6.52572846, "memory(GiB)": 21.51, "elapsed_time": "6h 32m 48s", "remaining_time": "5h 4m 16s", "loss_scale": 1.0, "consumed_samples": 917760, "global_step/max_steps": "3585/6362"} +{"lm loss": 4.93851995, "grad_norm": 0.50976408, "learning_rate": 4.532e-05, "elapsed_time_per_iteration": 6.24838901, "memory(GiB)": 21.51, "elapsed_time": "6h 32m 54s", "remaining_time": "5h 4m 9s", "loss_scale": 1.0, "consumed_samples": 918016, "global_step/max_steps": "3586/6362"} +{"lm loss": 4.91694927, "grad_norm": 0.43124631, "learning_rate": 4.529e-05, "elapsed_time_per_iteration": 6.7514801, "memory(GiB)": 21.51, "elapsed_time": "6h 33m 1s", "remaining_time": "5h 4m 3s", "loss_scale": 1.0, "consumed_samples": 918272, "global_step/max_steps": "3587/6362"} +{"lm loss": 4.89799166, "grad_norm": 0.49872825, "learning_rate": 4.527e-05, "elapsed_time_per_iteration": 6.50788593, "memory(GiB)": 21.51, "elapsed_time": "6h 33m 8s", "remaining_time": "5h 3m 56s", "loss_scale": 1.0, "consumed_samples": 918528, "global_step/max_steps": "3588/6362"} +{"lm loss": 4.91221046, "grad_norm": 0.46292824, "learning_rate": 4.524e-05, "elapsed_time_per_iteration": 6.4955771, "memory(GiB)": 21.51, "elapsed_time": "6h 33m 14s", "remaining_time": "5h 3m 50s", "loss_scale": 1.0, "consumed_samples": 918784, "global_step/max_steps": "3589/6362"} +{"lm loss": 4.94211531, "grad_norm": 0.49576044, "learning_rate": 4.522e-05, "elapsed_time_per_iteration": 6.42208552, "memory(GiB)": 21.51, "elapsed_time": "6h 33m 20s", "remaining_time": "5h 3m 43s", "loss_scale": 1.0, "consumed_samples": 919040, "global_step/max_steps": "3590/6362"} +{"lm loss": 4.90050316, "grad_norm": 0.49435726, "learning_rate": 4.519e-05, "elapsed_time_per_iteration": 6.52558279, "memory(GiB)": 21.51, "elapsed_time": "6h 33m 27s", "remaining_time": "5h 3m 36s", "loss_scale": 1.0, "consumed_samples": 919296, "global_step/max_steps": "3591/6362"} +{"lm loss": 4.92625809, "grad_norm": 0.43541583, "learning_rate": 4.517e-05, "elapsed_time_per_iteration": 7.35913038, "memory(GiB)": 21.51, "elapsed_time": "6h 33m 34s", "remaining_time": "5h 3m 30s", "loss_scale": 1.0, "consumed_samples": 919552, "global_step/max_steps": "3592/6362"} +{"lm loss": 4.90909719, "grad_norm": 0.49471056, "learning_rate": 4.514e-05, "elapsed_time_per_iteration": 6.6380322, "memory(GiB)": 21.51, "elapsed_time": "6h 33m 41s", "remaining_time": "5h 3m 24s", "loss_scale": 1.0, "consumed_samples": 919808, "global_step/max_steps": "3593/6362"} +{"lm loss": 4.91393805, "grad_norm": 0.57717258, "learning_rate": 4.512e-05, "elapsed_time_per_iteration": 6.86779022, "memory(GiB)": 21.51, "elapsed_time": "6h 33m 48s", "remaining_time": "5h 3m 17s", "loss_scale": 1.0, "consumed_samples": 920064, "global_step/max_steps": "3594/6362"} +{"lm loss": 4.90366268, "grad_norm": 0.62377882, "learning_rate": 4.509e-05, "elapsed_time_per_iteration": 6.73506069, "memory(GiB)": 21.51, "elapsed_time": "6h 33m 55s", "remaining_time": "5h 3m 11s", "loss_scale": 1.0, "consumed_samples": 920320, "global_step/max_steps": "3595/6362"} +{"lm loss": 4.89258385, "grad_norm": 0.50963289, "learning_rate": 4.507e-05, "elapsed_time_per_iteration": 6.43403554, "memory(GiB)": 21.51, "elapsed_time": "6h 34m 1s", "remaining_time": "5h 3m 4s", "loss_scale": 1.0, "consumed_samples": 920576, "global_step/max_steps": "3596/6362"} +{"lm loss": 4.92437983, "grad_norm": 0.43941036, "learning_rate": 4.504e-05, "elapsed_time_per_iteration": 6.68348742, "memory(GiB)": 21.51, "elapsed_time": "6h 34m 8s", "remaining_time": "5h 2m 58s", "loss_scale": 1.0, "consumed_samples": 920832, "global_step/max_steps": "3597/6362"} +{"lm loss": 4.92460299, "grad_norm": 0.50317287, "learning_rate": 4.502e-05, "elapsed_time_per_iteration": 6.67050004, "memory(GiB)": 21.51, "elapsed_time": "6h 34m 14s", "remaining_time": "5h 2m 51s", "loss_scale": 1.0, "consumed_samples": 921088, "global_step/max_steps": "3598/6362"} +{"lm loss": 4.90207243, "grad_norm": 0.48344108, "learning_rate": 4.499e-05, "elapsed_time_per_iteration": 6.62407303, "memory(GiB)": 21.51, "elapsed_time": "6h 34m 21s", "remaining_time": "5h 2m 45s", "loss_scale": 1.0, "consumed_samples": 921344, "global_step/max_steps": "3599/6362"} +{"lm loss": 4.93161678, "grad_norm": 0.4530881, "learning_rate": 4.497e-05, "elapsed_time_per_iteration": 6.35815191, "memory(GiB)": 21.51, "elapsed_time": "6h 34m 27s", "remaining_time": "5h 2m 38s", "loss_scale": 1.0, "consumed_samples": 921600, "global_step/max_steps": "3600/6362"} +{"lm loss": 4.92874479, "grad_norm": 0.42470691, "learning_rate": 4.494e-05, "elapsed_time_per_iteration": 6.54294491, "memory(GiB)": 21.51, "elapsed_time": "6h 34m 34s", "remaining_time": "5h 2m 31s", "loss_scale": 1.0, "consumed_samples": 921856, "global_step/max_steps": "3601/6362"} +{"lm loss": 4.92666292, "grad_norm": 0.44518, "learning_rate": 4.492e-05, "elapsed_time_per_iteration": 6.58224607, "memory(GiB)": 21.51, "elapsed_time": "6h 34m 40s", "remaining_time": "5h 2m 25s", "loss_scale": 1.0, "consumed_samples": 922112, "global_step/max_steps": "3602/6362"} +{"lm loss": 4.91564703, "grad_norm": 0.46449155, "learning_rate": 4.489e-05, "elapsed_time_per_iteration": 6.42454386, "memory(GiB)": 21.51, "elapsed_time": "6h 34m 47s", "remaining_time": "5h 2m 18s", "loss_scale": 1.0, "consumed_samples": 922368, "global_step/max_steps": "3603/6362"} +{"lm loss": 4.93157291, "grad_norm": 0.49773023, "learning_rate": 4.487e-05, "elapsed_time_per_iteration": 6.55869842, "memory(GiB)": 21.51, "elapsed_time": "6h 34m 53s", "remaining_time": "5h 2m 12s", "loss_scale": 1.0, "consumed_samples": 922624, "global_step/max_steps": "3604/6362"} +{"lm loss": 4.93445587, "grad_norm": 0.56110424, "learning_rate": 4.484e-05, "elapsed_time_per_iteration": 6.42731333, "memory(GiB)": 21.51, "elapsed_time": "6h 35m 0s", "remaining_time": "5h 2m 5s", "loss_scale": 1.0, "consumed_samples": 922880, "global_step/max_steps": "3605/6362"} +{"lm loss": 4.89315271, "grad_norm": 0.58444059, "learning_rate": 4.482e-05, "elapsed_time_per_iteration": 6.50635648, "memory(GiB)": 21.51, "elapsed_time": "6h 35m 6s", "remaining_time": "5h 1m 58s", "loss_scale": 1.0, "consumed_samples": 923136, "global_step/max_steps": "3606/6362"} +{"lm loss": 4.91773891, "grad_norm": 0.43250898, "learning_rate": 4.479e-05, "elapsed_time_per_iteration": 6.75884724, "memory(GiB)": 21.51, "elapsed_time": "6h 35m 13s", "remaining_time": "5h 1m 52s", "loss_scale": 1.0, "consumed_samples": 923392, "global_step/max_steps": "3607/6362"} +{"lm loss": 4.93944836, "grad_norm": 0.44003841, "learning_rate": 4.477e-05, "elapsed_time_per_iteration": 6.57667112, "memory(GiB)": 21.51, "elapsed_time": "6h 35m 20s", "remaining_time": "5h 1m 45s", "loss_scale": 1.0, "consumed_samples": 923648, "global_step/max_steps": "3608/6362"} +{"lm loss": 4.90117741, "grad_norm": 0.47161645, "learning_rate": 4.474e-05, "elapsed_time_per_iteration": 6.3915236, "memory(GiB)": 21.51, "elapsed_time": "6h 35m 26s", "remaining_time": "5h 1m 39s", "loss_scale": 1.0, "consumed_samples": 923904, "global_step/max_steps": "3609/6362"} +{"lm loss": 4.91583443, "grad_norm": 0.47287658, "learning_rate": 4.472e-05, "elapsed_time_per_iteration": 6.52680254, "memory(GiB)": 21.51, "elapsed_time": "6h 35m 33s", "remaining_time": "5h 1m 32s", "loss_scale": 1.0, "consumed_samples": 924160, "global_step/max_steps": "3610/6362"} +{"lm loss": 4.90318584, "grad_norm": 0.4526062, "learning_rate": 4.469e-05, "elapsed_time_per_iteration": 6.46154976, "memory(GiB)": 21.51, "elapsed_time": "6h 35m 39s", "remaining_time": "5h 1m 25s", "loss_scale": 1.0, "consumed_samples": 924416, "global_step/max_steps": "3611/6362"} +{"lm loss": 4.90842152, "grad_norm": 0.40735221, "learning_rate": 4.467e-05, "elapsed_time_per_iteration": 6.941571, "memory(GiB)": 21.51, "elapsed_time": "6h 35m 46s", "remaining_time": "5h 1m 19s", "loss_scale": 1.0, "consumed_samples": 924672, "global_step/max_steps": "3612/6362"} +{"lm loss": 4.90900135, "grad_norm": 0.45264834, "learning_rate": 4.464e-05, "elapsed_time_per_iteration": 6.86741805, "memory(GiB)": 21.51, "elapsed_time": "6h 35m 53s", "remaining_time": "5h 1m 13s", "loss_scale": 1.0, "consumed_samples": 924928, "global_step/max_steps": "3613/6362"} +{"lm loss": 4.91855812, "grad_norm": 0.50392294, "learning_rate": 4.462e-05, "elapsed_time_per_iteration": 6.49524188, "memory(GiB)": 21.51, "elapsed_time": "6h 35m 59s", "remaining_time": "5h 1m 6s", "loss_scale": 1.0, "consumed_samples": 925184, "global_step/max_steps": "3614/6362"} +{"lm loss": 4.91865635, "grad_norm": 0.52781117, "learning_rate": 4.459e-05, "elapsed_time_per_iteration": 6.88185048, "memory(GiB)": 21.51, "elapsed_time": "6h 36m 6s", "remaining_time": "5h 1m 0s", "loss_scale": 1.0, "consumed_samples": 925440, "global_step/max_steps": "3615/6362"} +{"lm loss": 4.90109491, "grad_norm": 0.41574219, "learning_rate": 4.457e-05, "elapsed_time_per_iteration": 6.76003528, "memory(GiB)": 21.51, "elapsed_time": "6h 36m 13s", "remaining_time": "5h 0m 53s", "loss_scale": 1.0, "consumed_samples": 925696, "global_step/max_steps": "3616/6362"} +{"lm loss": 4.89169598, "grad_norm": 0.48479596, "learning_rate": 4.454e-05, "elapsed_time_per_iteration": 6.51486301, "memory(GiB)": 21.51, "elapsed_time": "6h 36m 20s", "remaining_time": "5h 0m 47s", "loss_scale": 1.0, "consumed_samples": 925952, "global_step/max_steps": "3617/6362"} +{"lm loss": 4.92147207, "grad_norm": 0.47943506, "learning_rate": 4.452e-05, "elapsed_time_per_iteration": 6.40655184, "memory(GiB)": 21.51, "elapsed_time": "6h 36m 26s", "remaining_time": "5h 0m 40s", "loss_scale": 1.0, "consumed_samples": 926208, "global_step/max_steps": "3618/6362"} +{"lm loss": 4.92472887, "grad_norm": 0.48709905, "learning_rate": 4.449e-05, "elapsed_time_per_iteration": 6.62637258, "memory(GiB)": 21.51, "elapsed_time": "6h 36m 33s", "remaining_time": "5h 0m 33s", "loss_scale": 1.0, "consumed_samples": 926464, "global_step/max_steps": "3619/6362"} +{"lm loss": 4.92070246, "grad_norm": 0.50702739, "learning_rate": 4.447e-05, "elapsed_time_per_iteration": 6.38517165, "memory(GiB)": 21.51, "elapsed_time": "6h 36m 39s", "remaining_time": "5h 0m 27s", "loss_scale": 1.0, "consumed_samples": 926720, "global_step/max_steps": "3620/6362"} +{"lm loss": 4.92159653, "grad_norm": 0.43010172, "learning_rate": 4.444e-05, "elapsed_time_per_iteration": 6.68740177, "memory(GiB)": 21.51, "elapsed_time": "6h 36m 46s", "remaining_time": "5h 0m 20s", "loss_scale": 1.0, "consumed_samples": 926976, "global_step/max_steps": "3621/6362"} +{"lm loss": 4.91648054, "grad_norm": 0.48306516, "learning_rate": 4.442e-05, "elapsed_time_per_iteration": 6.56297731, "memory(GiB)": 21.51, "elapsed_time": "6h 36m 52s", "remaining_time": "5h 0m 14s", "loss_scale": 1.0, "consumed_samples": 927232, "global_step/max_steps": "3622/6362"} +{"lm loss": 4.90637589, "grad_norm": 0.44841889, "learning_rate": 4.439e-05, "elapsed_time_per_iteration": 6.26223254, "memory(GiB)": 21.51, "elapsed_time": "6h 36m 59s", "remaining_time": "5h 0m 7s", "loss_scale": 1.0, "consumed_samples": 927488, "global_step/max_steps": "3623/6362"} +{"lm loss": 4.90762091, "grad_norm": 0.46660984, "learning_rate": 4.437e-05, "elapsed_time_per_iteration": 6.50801945, "memory(GiB)": 21.51, "elapsed_time": "6h 37m 5s", "remaining_time": "5h 0m 0s", "loss_scale": 1.0, "consumed_samples": 927744, "global_step/max_steps": "3624/6362"} +{"lm loss": 4.9364872, "grad_norm": 0.46876958, "learning_rate": 4.434e-05, "elapsed_time_per_iteration": 6.3016603, "memory(GiB)": 21.51, "elapsed_time": "6h 37m 11s", "remaining_time": "4h 59m 53s", "loss_scale": 1.0, "consumed_samples": 928000, "global_step/max_steps": "3625/6362"} +{"lm loss": 4.91851616, "grad_norm": 0.43627018, "learning_rate": 4.432e-05, "elapsed_time_per_iteration": 6.38719678, "memory(GiB)": 21.51, "elapsed_time": "6h 37m 18s", "remaining_time": "4h 59m 47s", "loss_scale": 1.0, "consumed_samples": 928256, "global_step/max_steps": "3626/6362"} +{"lm loss": 4.90934229, "grad_norm": 0.45154041, "learning_rate": 4.429e-05, "elapsed_time_per_iteration": 6.62336683, "memory(GiB)": 21.51, "elapsed_time": "6h 37m 24s", "remaining_time": "4h 59m 40s", "loss_scale": 1.0, "consumed_samples": 928512, "global_step/max_steps": "3627/6362"} +{"lm loss": 4.91089296, "grad_norm": 0.43936017, "learning_rate": 4.427e-05, "elapsed_time_per_iteration": 6.64906287, "memory(GiB)": 21.51, "elapsed_time": "6h 37m 31s", "remaining_time": "4h 59m 34s", "loss_scale": 1.0, "consumed_samples": 928768, "global_step/max_steps": "3628/6362"} +{"lm loss": 4.92661476, "grad_norm": 0.4866479, "learning_rate": 4.424e-05, "elapsed_time_per_iteration": 6.45074511, "memory(GiB)": 21.51, "elapsed_time": "6h 37m 37s", "remaining_time": "4h 59m 27s", "loss_scale": 1.0, "consumed_samples": 929024, "global_step/max_steps": "3629/6362"} +{"lm loss": 4.89033413, "grad_norm": 0.47267067, "learning_rate": 4.422e-05, "elapsed_time_per_iteration": 6.56526399, "memory(GiB)": 21.51, "elapsed_time": "6h 37m 44s", "remaining_time": "4h 59m 20s", "loss_scale": 1.0, "consumed_samples": 929280, "global_step/max_steps": "3630/6362"} +{"lm loss": 4.92156696, "grad_norm": 0.42061329, "learning_rate": 4.419e-05, "elapsed_time_per_iteration": 6.53762412, "memory(GiB)": 21.51, "elapsed_time": "6h 37m 51s", "remaining_time": "4h 59m 14s", "loss_scale": 1.0, "consumed_samples": 929536, "global_step/max_steps": "3631/6362"} +{"lm loss": 4.92300749, "grad_norm": 0.47779605, "learning_rate": 4.417e-05, "elapsed_time_per_iteration": 6.44020367, "memory(GiB)": 21.51, "elapsed_time": "6h 37m 57s", "remaining_time": "4h 59m 7s", "loss_scale": 1.0, "consumed_samples": 929792, "global_step/max_steps": "3632/6362"} +{"lm loss": 4.91442585, "grad_norm": 0.42957655, "learning_rate": 4.414e-05, "elapsed_time_per_iteration": 6.66794848, "memory(GiB)": 21.51, "elapsed_time": "6h 38m 4s", "remaining_time": "4h 59m 1s", "loss_scale": 1.0, "consumed_samples": 930048, "global_step/max_steps": "3633/6362"} +{"lm loss": 4.91676378, "grad_norm": 0.4490597, "learning_rate": 4.412e-05, "elapsed_time_per_iteration": 6.35963392, "memory(GiB)": 21.51, "elapsed_time": "6h 38m 10s", "remaining_time": "4h 58m 54s", "loss_scale": 1.0, "consumed_samples": 930304, "global_step/max_steps": "3634/6362"} +{"lm loss": 4.93869591, "grad_norm": 0.43341163, "learning_rate": 4.409e-05, "elapsed_time_per_iteration": 6.4067874, "memory(GiB)": 21.51, "elapsed_time": "6h 38m 16s", "remaining_time": "4h 58m 47s", "loss_scale": 1.0, "consumed_samples": 930560, "global_step/max_steps": "3635/6362"} +{"lm loss": 4.91064787, "grad_norm": 0.39640018, "learning_rate": 4.407e-05, "elapsed_time_per_iteration": 6.58053017, "memory(GiB)": 21.51, "elapsed_time": "6h 38m 23s", "remaining_time": "4h 58m 41s", "loss_scale": 1.0, "consumed_samples": 930816, "global_step/max_steps": "3636/6362"} +{"lm loss": 4.91658926, "grad_norm": 0.40758806, "learning_rate": 4.404e-05, "elapsed_time_per_iteration": 6.54884434, "memory(GiB)": 21.51, "elapsed_time": "6h 38m 30s", "remaining_time": "4h 58m 34s", "loss_scale": 1.0, "consumed_samples": 931072, "global_step/max_steps": "3637/6362"} +{"lm loss": 4.90938282, "grad_norm": 0.4656432, "learning_rate": 4.402e-05, "elapsed_time_per_iteration": 6.44447303, "memory(GiB)": 21.51, "elapsed_time": "6h 38m 36s", "remaining_time": "4h 58m 27s", "loss_scale": 1.0, "consumed_samples": 931328, "global_step/max_steps": "3638/6362"} +{"lm loss": 4.8985796, "grad_norm": 0.53772521, "learning_rate": 4.399e-05, "elapsed_time_per_iteration": 6.35296297, "memory(GiB)": 21.51, "elapsed_time": "6h 38m 42s", "remaining_time": "4h 58m 21s", "loss_scale": 1.0, "consumed_samples": 931584, "global_step/max_steps": "3639/6362"} +{"lm loss": 4.92628813, "grad_norm": 0.54058629, "learning_rate": 4.397e-05, "elapsed_time_per_iteration": 6.48586559, "memory(GiB)": 21.51, "elapsed_time": "6h 38m 49s", "remaining_time": "4h 58m 14s", "loss_scale": 1.0, "consumed_samples": 931840, "global_step/max_steps": "3640/6362"} +{"lm loss": 4.91441202, "grad_norm": 0.51278377, "learning_rate": 4.394e-05, "elapsed_time_per_iteration": 6.80649567, "memory(GiB)": 21.51, "elapsed_time": "6h 38m 56s", "remaining_time": "4h 58m 7s", "loss_scale": 1.0, "consumed_samples": 932096, "global_step/max_steps": "3641/6362"} +{"lm loss": 4.92725182, "grad_norm": 0.44002664, "learning_rate": 4.392e-05, "elapsed_time_per_iteration": 6.71728826, "memory(GiB)": 21.51, "elapsed_time": "6h 39m 2s", "remaining_time": "4h 58m 1s", "loss_scale": 1.0, "consumed_samples": 932352, "global_step/max_steps": "3642/6362"} +{"lm loss": 4.91536951, "grad_norm": 0.40603799, "learning_rate": 4.389e-05, "elapsed_time_per_iteration": 6.70883083, "memory(GiB)": 21.51, "elapsed_time": "6h 39m 9s", "remaining_time": "4h 57m 55s", "loss_scale": 1.0, "consumed_samples": 932608, "global_step/max_steps": "3643/6362"} +{"lm loss": 4.90170908, "grad_norm": 0.49240661, "learning_rate": 4.387e-05, "elapsed_time_per_iteration": 6.78154755, "memory(GiB)": 21.51, "elapsed_time": "6h 39m 16s", "remaining_time": "4h 57m 48s", "loss_scale": 1.0, "consumed_samples": 932864, "global_step/max_steps": "3644/6362"} +{"lm loss": 4.91901064, "grad_norm": 0.43868819, "learning_rate": 4.384e-05, "elapsed_time_per_iteration": 6.68518806, "memory(GiB)": 21.51, "elapsed_time": "6h 39m 23s", "remaining_time": "4h 57m 42s", "loss_scale": 1.0, "consumed_samples": 933120, "global_step/max_steps": "3645/6362"} +{"lm loss": 4.89526367, "grad_norm": 0.43822202, "learning_rate": 4.382e-05, "elapsed_time_per_iteration": 6.77680016, "memory(GiB)": 21.51, "elapsed_time": "6h 39m 29s", "remaining_time": "4h 57m 35s", "loss_scale": 1.0, "consumed_samples": 933376, "global_step/max_steps": "3646/6362"} +{"lm loss": 4.87977123, "grad_norm": 0.42412665, "learning_rate": 4.379e-05, "elapsed_time_per_iteration": 6.52236247, "memory(GiB)": 21.51, "elapsed_time": "6h 39m 36s", "remaining_time": "4h 57m 29s", "loss_scale": 1.0, "consumed_samples": 933632, "global_step/max_steps": "3647/6362"} +{"lm loss": 4.92243004, "grad_norm": 0.48054704, "learning_rate": 4.377e-05, "elapsed_time_per_iteration": 6.34860182, "memory(GiB)": 21.51, "elapsed_time": "6h 39m 42s", "remaining_time": "4h 57m 22s", "loss_scale": 1.0, "consumed_samples": 933888, "global_step/max_steps": "3648/6362"} +{"lm loss": 4.91238832, "grad_norm": 0.45616099, "learning_rate": 4.374e-05, "elapsed_time_per_iteration": 6.60334539, "memory(GiB)": 21.51, "elapsed_time": "6h 39m 49s", "remaining_time": "4h 57m 15s", "loss_scale": 1.0, "consumed_samples": 934144, "global_step/max_steps": "3649/6362"} +{"lm loss": 4.91837025, "grad_norm": 0.45319867, "learning_rate": 4.372e-05, "elapsed_time_per_iteration": 6.70341706, "memory(GiB)": 21.51, "elapsed_time": "6h 39m 55s", "remaining_time": "4h 57m 9s", "loss_scale": 1.0, "consumed_samples": 934400, "global_step/max_steps": "3650/6362"} +{"lm loss": 4.90346861, "grad_norm": 0.48548588, "learning_rate": 4.37e-05, "elapsed_time_per_iteration": 6.69375992, "memory(GiB)": 21.51, "elapsed_time": "6h 40m 2s", "remaining_time": "4h 57m 2s", "loss_scale": 1.0, "consumed_samples": 934656, "global_step/max_steps": "3651/6362"} +{"lm loss": 4.91022778, "grad_norm": 0.45123389, "learning_rate": 4.367e-05, "elapsed_time_per_iteration": 6.6346097, "memory(GiB)": 21.51, "elapsed_time": "6h 40m 9s", "remaining_time": "4h 56m 56s", "loss_scale": 1.0, "consumed_samples": 934912, "global_step/max_steps": "3652/6362"} +{"lm loss": 4.91741991, "grad_norm": 0.44892174, "learning_rate": 4.365e-05, "elapsed_time_per_iteration": 6.4511652, "memory(GiB)": 21.51, "elapsed_time": "6h 40m 15s", "remaining_time": "4h 56m 49s", "loss_scale": 1.0, "consumed_samples": 935168, "global_step/max_steps": "3653/6362"} +{"lm loss": 4.91325331, "grad_norm": 0.49704218, "learning_rate": 4.362e-05, "elapsed_time_per_iteration": 6.64292836, "memory(GiB)": 21.51, "elapsed_time": "6h 40m 22s", "remaining_time": "4h 56m 43s", "loss_scale": 1.0, "consumed_samples": 935424, "global_step/max_steps": "3654/6362"} +{"lm loss": 4.92393494, "grad_norm": 0.44867998, "learning_rate": 4.36e-05, "elapsed_time_per_iteration": 6.65356827, "memory(GiB)": 21.51, "elapsed_time": "6h 40m 29s", "remaining_time": "4h 56m 36s", "loss_scale": 1.0, "consumed_samples": 935680, "global_step/max_steps": "3655/6362"} +{"lm loss": 4.91499138, "grad_norm": 0.45962256, "learning_rate": 4.357e-05, "elapsed_time_per_iteration": 6.79477477, "memory(GiB)": 21.51, "elapsed_time": "6h 40m 35s", "remaining_time": "4h 56m 30s", "loss_scale": 1.0, "consumed_samples": 935936, "global_step/max_steps": "3656/6362"} +{"lm loss": 4.91633224, "grad_norm": 0.47543025, "learning_rate": 4.355e-05, "elapsed_time_per_iteration": 6.44619751, "memory(GiB)": 21.51, "elapsed_time": "6h 40m 42s", "remaining_time": "4h 56m 23s", "loss_scale": 1.0, "consumed_samples": 936192, "global_step/max_steps": "3657/6362"} +{"lm loss": 4.91504717, "grad_norm": 0.47268158, "learning_rate": 4.352e-05, "elapsed_time_per_iteration": 6.55579758, "memory(GiB)": 21.51, "elapsed_time": "6h 40m 48s", "remaining_time": "4h 56m 16s", "loss_scale": 1.0, "consumed_samples": 936448, "global_step/max_steps": "3658/6362"} +{"lm loss": 4.92656708, "grad_norm": 0.46002525, "learning_rate": 4.35e-05, "elapsed_time_per_iteration": 6.48198414, "memory(GiB)": 21.51, "elapsed_time": "6h 40m 55s", "remaining_time": "4h 56m 10s", "loss_scale": 1.0, "consumed_samples": 936704, "global_step/max_steps": "3659/6362"} +{"lm loss": 4.88602781, "grad_norm": 0.42370528, "learning_rate": 4.347e-05, "elapsed_time_per_iteration": 6.44986963, "memory(GiB)": 21.51, "elapsed_time": "6h 41m 1s", "remaining_time": "4h 56m 3s", "loss_scale": 1.0, "consumed_samples": 936960, "global_step/max_steps": "3660/6362"} +{"lm loss": 4.90591192, "grad_norm": 0.51153821, "learning_rate": 4.345e-05, "elapsed_time_per_iteration": 6.47557425, "memory(GiB)": 21.51, "elapsed_time": "6h 41m 8s", "remaining_time": "4h 55m 56s", "loss_scale": 1.0, "consumed_samples": 937216, "global_step/max_steps": "3661/6362"} +{"lm loss": 4.91876984, "grad_norm": 0.57405818, "learning_rate": 4.342e-05, "elapsed_time_per_iteration": 6.33112836, "memory(GiB)": 21.51, "elapsed_time": "6h 41m 14s", "remaining_time": "4h 55m 50s", "loss_scale": 1.0, "consumed_samples": 937472, "global_step/max_steps": "3662/6362"} +{"lm loss": 4.89621305, "grad_norm": 0.47675383, "learning_rate": 4.34e-05, "elapsed_time_per_iteration": 6.713346, "memory(GiB)": 21.51, "elapsed_time": "6h 41m 21s", "remaining_time": "4h 55m 43s", "loss_scale": 1.0, "consumed_samples": 937728, "global_step/max_steps": "3663/6362"} +{"lm loss": 4.90433216, "grad_norm": 0.4150829, "learning_rate": 4.337e-05, "elapsed_time_per_iteration": 6.66460586, "memory(GiB)": 21.51, "elapsed_time": "6h 41m 27s", "remaining_time": "4h 55m 37s", "loss_scale": 1.0, "consumed_samples": 937984, "global_step/max_steps": "3664/6362"} +{"lm loss": 4.92136669, "grad_norm": 0.43244556, "learning_rate": 4.335e-05, "elapsed_time_per_iteration": 6.60090828, "memory(GiB)": 21.51, "elapsed_time": "6h 41m 34s", "remaining_time": "4h 55m 30s", "loss_scale": 1.0, "consumed_samples": 938240, "global_step/max_steps": "3665/6362"} +{"lm loss": 4.91906118, "grad_norm": 0.50774503, "learning_rate": 4.332e-05, "elapsed_time_per_iteration": 6.398103, "memory(GiB)": 21.51, "elapsed_time": "6h 41m 40s", "remaining_time": "4h 55m 24s", "loss_scale": 1.0, "consumed_samples": 938496, "global_step/max_steps": "3666/6362"} +{"lm loss": 4.89003563, "grad_norm": 0.52578825, "learning_rate": 4.33e-05, "elapsed_time_per_iteration": 6.52126646, "memory(GiB)": 21.51, "elapsed_time": "6h 41m 47s", "remaining_time": "4h 55m 17s", "loss_scale": 1.0, "consumed_samples": 938752, "global_step/max_steps": "3667/6362"} +{"lm loss": 4.91883278, "grad_norm": 0.42133972, "learning_rate": 4.327e-05, "elapsed_time_per_iteration": 6.54404616, "memory(GiB)": 21.51, "elapsed_time": "6h 41m 54s", "remaining_time": "4h 55m 10s", "loss_scale": 1.0, "consumed_samples": 939008, "global_step/max_steps": "3668/6362"} +{"lm loss": 4.92601061, "grad_norm": 0.45880663, "learning_rate": 4.325e-05, "elapsed_time_per_iteration": 6.35611987, "memory(GiB)": 21.51, "elapsed_time": "6h 42m 0s", "remaining_time": "4h 55m 4s", "loss_scale": 1.0, "consumed_samples": 939264, "global_step/max_steps": "3669/6362"} +{"lm loss": 4.90604734, "grad_norm": 0.45918182, "learning_rate": 4.322e-05, "elapsed_time_per_iteration": 6.46826124, "memory(GiB)": 21.51, "elapsed_time": "6h 42m 6s", "remaining_time": "4h 54m 57s", "loss_scale": 1.0, "consumed_samples": 939520, "global_step/max_steps": "3670/6362"} +{"lm loss": 4.91121864, "grad_norm": 0.51772517, "learning_rate": 4.32e-05, "elapsed_time_per_iteration": 6.55123782, "memory(GiB)": 21.51, "elapsed_time": "6h 42m 13s", "remaining_time": "4h 54m 50s", "loss_scale": 1.0, "consumed_samples": 939776, "global_step/max_steps": "3671/6362"} +{"lm loss": 4.90876532, "grad_norm": 0.48485631, "learning_rate": 4.317e-05, "elapsed_time_per_iteration": 6.5602951, "memory(GiB)": 21.51, "elapsed_time": "6h 42m 19s", "remaining_time": "4h 54m 44s", "loss_scale": 1.0, "consumed_samples": 940032, "global_step/max_steps": "3672/6362"} +{"lm loss": 4.93375349, "grad_norm": 0.43592259, "learning_rate": 4.315e-05, "elapsed_time_per_iteration": 6.72788644, "memory(GiB)": 21.51, "elapsed_time": "6h 42m 26s", "remaining_time": "4h 54m 37s", "loss_scale": 1.0, "consumed_samples": 940288, "global_step/max_steps": "3673/6362"} +{"lm loss": 4.89928436, "grad_norm": 0.5292365, "learning_rate": 4.312e-05, "elapsed_time_per_iteration": 6.79183006, "memory(GiB)": 21.51, "elapsed_time": "6h 42m 33s", "remaining_time": "4h 54m 31s", "loss_scale": 1.0, "consumed_samples": 940544, "global_step/max_steps": "3674/6362"} +{"lm loss": 4.89334869, "grad_norm": 0.48994765, "learning_rate": 4.31e-05, "elapsed_time_per_iteration": 6.60605216, "memory(GiB)": 21.51, "elapsed_time": "6h 42m 40s", "remaining_time": "4h 54m 24s", "loss_scale": 1.0, "consumed_samples": 940800, "global_step/max_steps": "3675/6362"} +{"lm loss": 4.89710045, "grad_norm": 0.46800762, "learning_rate": 4.307e-05, "elapsed_time_per_iteration": 6.52899718, "memory(GiB)": 21.51, "elapsed_time": "6h 42m 46s", "remaining_time": "4h 54m 18s", "loss_scale": 1.0, "consumed_samples": 941056, "global_step/max_steps": "3676/6362"} +{"lm loss": 4.91690493, "grad_norm": 0.49178866, "learning_rate": 4.305e-05, "elapsed_time_per_iteration": 6.6795423, "memory(GiB)": 21.51, "elapsed_time": "6h 42m 53s", "remaining_time": "4h 54m 11s", "loss_scale": 1.0, "consumed_samples": 941312, "global_step/max_steps": "3677/6362"} +{"lm loss": 4.90837765, "grad_norm": 0.45580927, "learning_rate": 4.302e-05, "elapsed_time_per_iteration": 6.60775137, "memory(GiB)": 21.51, "elapsed_time": "6h 42m 59s", "remaining_time": "4h 54m 5s", "loss_scale": 1.0, "consumed_samples": 941568, "global_step/max_steps": "3678/6362"} +{"lm loss": 4.92748785, "grad_norm": 0.47450563, "learning_rate": 4.3e-05, "elapsed_time_per_iteration": 6.33729792, "memory(GiB)": 21.51, "elapsed_time": "6h 43m 6s", "remaining_time": "4h 53m 58s", "loss_scale": 1.0, "consumed_samples": 941824, "global_step/max_steps": "3679/6362"} +{"lm loss": 4.90635204, "grad_norm": 0.42236829, "learning_rate": 4.297e-05, "elapsed_time_per_iteration": 6.51988864, "memory(GiB)": 21.51, "elapsed_time": "6h 43m 12s", "remaining_time": "4h 53m 51s", "loss_scale": 1.0, "consumed_samples": 942080, "global_step/max_steps": "3680/6362"} +{"lm loss": 4.93003607, "grad_norm": 0.44897747, "learning_rate": 4.295e-05, "elapsed_time_per_iteration": 6.46370625, "memory(GiB)": 21.51, "elapsed_time": "6h 43m 19s", "remaining_time": "4h 53m 45s", "loss_scale": 1.0, "consumed_samples": 942336, "global_step/max_steps": "3681/6362"} +{"lm loss": 4.92870235, "grad_norm": 0.45507818, "learning_rate": 4.292e-05, "elapsed_time_per_iteration": 6.49413848, "memory(GiB)": 21.51, "elapsed_time": "6h 43m 25s", "remaining_time": "4h 53m 38s", "loss_scale": 1.0, "consumed_samples": 942592, "global_step/max_steps": "3682/6362"} +{"lm loss": 4.90637589, "grad_norm": 0.44086471, "learning_rate": 4.29e-05, "elapsed_time_per_iteration": 6.67082953, "memory(GiB)": 21.51, "elapsed_time": "6h 43m 32s", "remaining_time": "4h 53m 31s", "loss_scale": 1.0, "consumed_samples": 942848, "global_step/max_steps": "3683/6362"} +{"lm loss": 4.90578461, "grad_norm": 0.46319908, "learning_rate": 4.288e-05, "elapsed_time_per_iteration": 6.68490481, "memory(GiB)": 21.51, "elapsed_time": "6h 43m 39s", "remaining_time": "4h 53m 25s", "loss_scale": 1.0, "consumed_samples": 943104, "global_step/max_steps": "3684/6362"} +{"lm loss": 4.90714455, "grad_norm": 0.48979908, "learning_rate": 4.285e-05, "elapsed_time_per_iteration": 6.53855228, "memory(GiB)": 21.51, "elapsed_time": "6h 43m 45s", "remaining_time": "4h 53m 18s", "loss_scale": 1.0, "consumed_samples": 943360, "global_step/max_steps": "3685/6362"} +{"lm loss": 4.92815542, "grad_norm": 0.43204942, "learning_rate": 4.283e-05, "elapsed_time_per_iteration": 6.64720535, "memory(GiB)": 21.51, "elapsed_time": "6h 43m 52s", "remaining_time": "4h 53m 12s", "loss_scale": 1.0, "consumed_samples": 943616, "global_step/max_steps": "3686/6362"} +{"lm loss": 4.92882347, "grad_norm": 0.47052613, "learning_rate": 4.28e-05, "elapsed_time_per_iteration": 6.39656186, "memory(GiB)": 21.51, "elapsed_time": "6h 43m 58s", "remaining_time": "4h 53m 5s", "loss_scale": 1.0, "consumed_samples": 943872, "global_step/max_steps": "3687/6362"} +{"lm loss": 4.90015459, "grad_norm": 0.50366402, "learning_rate": 4.278e-05, "elapsed_time_per_iteration": 6.5405736, "memory(GiB)": 21.51, "elapsed_time": "6h 44m 5s", "remaining_time": "4h 52m 59s", "loss_scale": 1.0, "consumed_samples": 944128, "global_step/max_steps": "3688/6362"} +{"lm loss": 4.90780592, "grad_norm": 0.4633503, "learning_rate": 4.275e-05, "elapsed_time_per_iteration": 6.69529986, "memory(GiB)": 21.51, "elapsed_time": "6h 44m 11s", "remaining_time": "4h 52m 52s", "loss_scale": 1.0, "consumed_samples": 944384, "global_step/max_steps": "3689/6362"} +{"lm loss": 4.92070866, "grad_norm": 0.43239039, "learning_rate": 4.273e-05, "elapsed_time_per_iteration": 6.48632264, "memory(GiB)": 21.51, "elapsed_time": "6h 44m 18s", "remaining_time": "4h 52m 45s", "loss_scale": 1.0, "consumed_samples": 944640, "global_step/max_steps": "3690/6362"} +{"lm loss": 4.89719915, "grad_norm": 0.47112885, "learning_rate": 4.27e-05, "elapsed_time_per_iteration": 6.79749823, "memory(GiB)": 21.51, "elapsed_time": "6h 44m 25s", "remaining_time": "4h 52m 39s", "loss_scale": 1.0, "consumed_samples": 944896, "global_step/max_steps": "3691/6362"} +{"lm loss": 4.89130783, "grad_norm": 0.43073595, "learning_rate": 4.268e-05, "elapsed_time_per_iteration": 6.66682172, "memory(GiB)": 21.51, "elapsed_time": "6h 44m 31s", "remaining_time": "4h 52m 33s", "loss_scale": 1.0, "consumed_samples": 945152, "global_step/max_steps": "3692/6362"} +{"lm loss": 4.91498899, "grad_norm": 0.47386917, "learning_rate": 4.265e-05, "elapsed_time_per_iteration": 6.47657657, "memory(GiB)": 21.51, "elapsed_time": "6h 44m 38s", "remaining_time": "4h 52m 26s", "loss_scale": 1.0, "consumed_samples": 945408, "global_step/max_steps": "3693/6362"} +{"lm loss": 4.90733767, "grad_norm": 0.44949335, "learning_rate": 4.263e-05, "elapsed_time_per_iteration": 6.54331064, "memory(GiB)": 21.51, "elapsed_time": "6h 44m 44s", "remaining_time": "4h 52m 19s", "loss_scale": 1.0, "consumed_samples": 945664, "global_step/max_steps": "3694/6362"} +{"lm loss": 4.8843236, "grad_norm": 0.41580668, "learning_rate": 4.26e-05, "elapsed_time_per_iteration": 6.62414241, "memory(GiB)": 21.51, "elapsed_time": "6h 44m 51s", "remaining_time": "4h 52m 13s", "loss_scale": 1.0, "consumed_samples": 945920, "global_step/max_steps": "3695/6362"} +{"lm loss": 4.90272999, "grad_norm": 0.42138478, "learning_rate": 4.258e-05, "elapsed_time_per_iteration": 6.61100078, "memory(GiB)": 21.51, "elapsed_time": "6h 44m 58s", "remaining_time": "4h 52m 6s", "loss_scale": 1.0, "consumed_samples": 946176, "global_step/max_steps": "3696/6362"} +{"lm loss": 4.90042639, "grad_norm": 0.42585135, "learning_rate": 4.255e-05, "elapsed_time_per_iteration": 6.67731142, "memory(GiB)": 21.51, "elapsed_time": "6h 45m 4s", "remaining_time": "4h 52m 0s", "loss_scale": 1.0, "consumed_samples": 946432, "global_step/max_steps": "3697/6362"} +{"lm loss": 4.93629456, "grad_norm": 0.42484865, "learning_rate": 4.253e-05, "elapsed_time_per_iteration": 6.45919228, "memory(GiB)": 21.51, "elapsed_time": "6h 45m 11s", "remaining_time": "4h 51m 53s", "loss_scale": 1.0, "consumed_samples": 946688, "global_step/max_steps": "3698/6362"} +{"lm loss": 4.92493439, "grad_norm": 0.42056951, "learning_rate": 4.25e-05, "elapsed_time_per_iteration": 6.63053608, "memory(GiB)": 21.51, "elapsed_time": "6h 45m 17s", "remaining_time": "4h 51m 47s", "loss_scale": 1.0, "consumed_samples": 946944, "global_step/max_steps": "3699/6362"} +{"lm loss": 4.92984819, "grad_norm": 0.47316828, "learning_rate": 4.248e-05, "elapsed_time_per_iteration": 6.42965126, "memory(GiB)": 21.51, "elapsed_time": "6h 45m 24s", "remaining_time": "4h 51m 40s", "loss_scale": 1.0, "consumed_samples": 947200, "global_step/max_steps": "3700/6362"} +{"lm loss": 4.90490961, "grad_norm": 0.44034779, "learning_rate": 4.245e-05, "elapsed_time_per_iteration": 6.43287182, "memory(GiB)": 21.51, "elapsed_time": "6h 45m 30s", "remaining_time": "4h 51m 33s", "loss_scale": 1.0, "consumed_samples": 947456, "global_step/max_steps": "3701/6362"} +{"lm loss": 4.91609001, "grad_norm": 0.41703835, "learning_rate": 4.243e-05, "elapsed_time_per_iteration": 6.39969492, "memory(GiB)": 21.51, "elapsed_time": "6h 45m 37s", "remaining_time": "4h 51m 26s", "loss_scale": 1.0, "consumed_samples": 947712, "global_step/max_steps": "3702/6362"} +{"lm loss": 4.92062283, "grad_norm": 0.41158593, "learning_rate": 4.24e-05, "elapsed_time_per_iteration": 6.48192906, "memory(GiB)": 21.51, "elapsed_time": "6h 45m 43s", "remaining_time": "4h 51m 20s", "loss_scale": 1.0, "consumed_samples": 947968, "global_step/max_steps": "3703/6362"} +{"lm loss": 4.89207983, "grad_norm": 0.42157757, "learning_rate": 4.238e-05, "elapsed_time_per_iteration": 6.46855259, "memory(GiB)": 21.51, "elapsed_time": "6h 45m 50s", "remaining_time": "4h 51m 13s", "loss_scale": 1.0, "consumed_samples": 948224, "global_step/max_steps": "3704/6362"} +{"lm loss": 4.92563152, "grad_norm": 0.46583369, "learning_rate": 4.235e-05, "elapsed_time_per_iteration": 6.41333747, "memory(GiB)": 21.51, "elapsed_time": "6h 45m 56s", "remaining_time": "4h 51m 6s", "loss_scale": 1.0, "consumed_samples": 948480, "global_step/max_steps": "3705/6362"} +{"lm loss": 4.91105509, "grad_norm": 0.4641307, "learning_rate": 4.233e-05, "elapsed_time_per_iteration": 6.29012346, "memory(GiB)": 21.51, "elapsed_time": "6h 46m 2s", "remaining_time": "4h 51m 0s", "loss_scale": 1.0, "consumed_samples": 948736, "global_step/max_steps": "3706/6362"} +{"lm loss": 4.9173789, "grad_norm": 0.43725935, "learning_rate": 4.231e-05, "elapsed_time_per_iteration": 6.55619431, "memory(GiB)": 21.51, "elapsed_time": "6h 46m 9s", "remaining_time": "4h 50m 53s", "loss_scale": 1.0, "consumed_samples": 948992, "global_step/max_steps": "3707/6362"} +{"lm loss": 4.9150219, "grad_norm": 0.39272469, "learning_rate": 4.228e-05, "elapsed_time_per_iteration": 6.3005023, "memory(GiB)": 21.51, "elapsed_time": "6h 46m 15s", "remaining_time": "4h 50m 46s", "loss_scale": 1.0, "consumed_samples": 949248, "global_step/max_steps": "3708/6362"} +{"lm loss": 4.91477919, "grad_norm": 0.42147413, "learning_rate": 4.226e-05, "elapsed_time_per_iteration": 6.38683891, "memory(GiB)": 21.51, "elapsed_time": "6h 46m 22s", "remaining_time": "4h 50m 40s", "loss_scale": 1.0, "consumed_samples": 949504, "global_step/max_steps": "3709/6362"} +{"lm loss": 4.90847778, "grad_norm": 0.44326964, "learning_rate": 4.223e-05, "elapsed_time_per_iteration": 6.47177529, "memory(GiB)": 21.51, "elapsed_time": "6h 46m 28s", "remaining_time": "4h 50m 33s", "loss_scale": 1.0, "consumed_samples": 949760, "global_step/max_steps": "3710/6362"} +{"lm loss": 4.90353632, "grad_norm": 0.42575651, "learning_rate": 4.221e-05, "elapsed_time_per_iteration": 6.35857773, "memory(GiB)": 21.51, "elapsed_time": "6h 46m 34s", "remaining_time": "4h 50m 26s", "loss_scale": 1.0, "consumed_samples": 950016, "global_step/max_steps": "3711/6362"} +{"lm loss": 4.91022015, "grad_norm": 0.42775816, "learning_rate": 4.218e-05, "elapsed_time_per_iteration": 6.62798285, "memory(GiB)": 21.51, "elapsed_time": "6h 46m 41s", "remaining_time": "4h 50m 20s", "loss_scale": 1.0, "consumed_samples": 950272, "global_step/max_steps": "3712/6362"} +{"lm loss": 4.92941618, "grad_norm": 0.39514109, "learning_rate": 4.216e-05, "elapsed_time_per_iteration": 6.69539595, "memory(GiB)": 21.51, "elapsed_time": "6h 46m 48s", "remaining_time": "4h 50m 13s", "loss_scale": 1.0, "consumed_samples": 950528, "global_step/max_steps": "3713/6362"} +{"lm loss": 4.90147161, "grad_norm": 0.42371541, "learning_rate": 4.213e-05, "elapsed_time_per_iteration": 6.59167123, "memory(GiB)": 21.51, "elapsed_time": "6h 46m 54s", "remaining_time": "4h 50m 7s", "loss_scale": 1.0, "consumed_samples": 950784, "global_step/max_steps": "3714/6362"} +{"lm loss": 4.91081333, "grad_norm": 0.44302425, "learning_rate": 4.211e-05, "elapsed_time_per_iteration": 6.36319494, "memory(GiB)": 21.51, "elapsed_time": "6h 47m 1s", "remaining_time": "4h 50m 0s", "loss_scale": 1.0, "consumed_samples": 951040, "global_step/max_steps": "3715/6362"} +{"lm loss": 4.91252995, "grad_norm": 0.42561266, "learning_rate": 4.208e-05, "elapsed_time_per_iteration": 6.47656488, "memory(GiB)": 21.51, "elapsed_time": "6h 47m 7s", "remaining_time": "4h 49m 53s", "loss_scale": 1.0, "consumed_samples": 951296, "global_step/max_steps": "3716/6362"} +{"lm loss": 4.91542387, "grad_norm": 0.40233278, "learning_rate": 4.206e-05, "elapsed_time_per_iteration": 6.37900448, "memory(GiB)": 21.51, "elapsed_time": "6h 47m 13s", "remaining_time": "4h 49m 47s", "loss_scale": 1.0, "consumed_samples": 951552, "global_step/max_steps": "3717/6362"} +{"lm loss": 4.91120148, "grad_norm": 0.46315756, "learning_rate": 4.203e-05, "elapsed_time_per_iteration": 6.36181808, "memory(GiB)": 21.51, "elapsed_time": "6h 47m 20s", "remaining_time": "4h 49m 40s", "loss_scale": 1.0, "consumed_samples": 951808, "global_step/max_steps": "3718/6362"} +{"lm loss": 4.89395475, "grad_norm": 0.53543693, "learning_rate": 4.201e-05, "elapsed_time_per_iteration": 6.49251819, "memory(GiB)": 21.51, "elapsed_time": "6h 47m 26s", "remaining_time": "4h 49m 33s", "loss_scale": 1.0, "consumed_samples": 952064, "global_step/max_steps": "3719/6362"} +{"lm loss": 4.92182302, "grad_norm": 0.54478854, "learning_rate": 4.198e-05, "elapsed_time_per_iteration": 6.50948071, "memory(GiB)": 21.51, "elapsed_time": "6h 47m 33s", "remaining_time": "4h 49m 27s", "loss_scale": 1.0, "consumed_samples": 952320, "global_step/max_steps": "3720/6362"} +{"lm loss": 4.92291355, "grad_norm": 0.48935828, "learning_rate": 4.196e-05, "elapsed_time_per_iteration": 6.50397801, "memory(GiB)": 21.51, "elapsed_time": "6h 47m 39s", "remaining_time": "4h 49m 20s", "loss_scale": 1.0, "consumed_samples": 952576, "global_step/max_steps": "3721/6362"} +{"lm loss": 4.86125851, "grad_norm": 0.47095677, "learning_rate": 4.193e-05, "elapsed_time_per_iteration": 6.63709092, "memory(GiB)": 21.51, "elapsed_time": "6h 47m 46s", "remaining_time": "4h 49m 13s", "loss_scale": 1.0, "consumed_samples": 952832, "global_step/max_steps": "3722/6362"} +{"lm loss": 4.91115093, "grad_norm": 0.45037603, "learning_rate": 4.191e-05, "elapsed_time_per_iteration": 6.74580407, "memory(GiB)": 21.51, "elapsed_time": "6h 47m 53s", "remaining_time": "4h 49m 7s", "loss_scale": 1.0, "consumed_samples": 953088, "global_step/max_steps": "3723/6362"} +{"lm loss": 4.91691494, "grad_norm": 0.49227473, "learning_rate": 4.188e-05, "elapsed_time_per_iteration": 6.58890486, "memory(GiB)": 21.51, "elapsed_time": "6h 47m 59s", "remaining_time": "4h 49m 0s", "loss_scale": 1.0, "consumed_samples": 953344, "global_step/max_steps": "3724/6362"} +{"lm loss": 4.90835619, "grad_norm": 0.4343859, "learning_rate": 4.186e-05, "elapsed_time_per_iteration": 6.68805218, "memory(GiB)": 21.51, "elapsed_time": "6h 48m 6s", "remaining_time": "4h 48m 54s", "loss_scale": 1.0, "consumed_samples": 953600, "global_step/max_steps": "3725/6362"} +{"lm loss": 4.89274359, "grad_norm": 0.4972119, "learning_rate": 4.184e-05, "elapsed_time_per_iteration": 6.65069818, "memory(GiB)": 21.51, "elapsed_time": "6h 48m 13s", "remaining_time": "4h 48m 47s", "loss_scale": 1.0, "consumed_samples": 953856, "global_step/max_steps": "3726/6362"} +{"lm loss": 4.88897991, "grad_norm": 0.48639053, "learning_rate": 4.181e-05, "elapsed_time_per_iteration": 6.76720119, "memory(GiB)": 21.51, "elapsed_time": "6h 48m 19s", "remaining_time": "4h 48m 41s", "loss_scale": 1.0, "consumed_samples": 954112, "global_step/max_steps": "3727/6362"} +{"lm loss": 4.92823792, "grad_norm": 0.44074896, "learning_rate": 4.179e-05, "elapsed_time_per_iteration": 6.52657747, "memory(GiB)": 21.51, "elapsed_time": "6h 48m 26s", "remaining_time": "4h 48m 34s", "loss_scale": 1.0, "consumed_samples": 954368, "global_step/max_steps": "3728/6362"} +{"lm loss": 4.91206455, "grad_norm": 0.44347948, "learning_rate": 4.176e-05, "elapsed_time_per_iteration": 6.61395788, "memory(GiB)": 21.51, "elapsed_time": "6h 48m 33s", "remaining_time": "4h 48m 28s", "loss_scale": 1.0, "consumed_samples": 954624, "global_step/max_steps": "3729/6362"} +{"lm loss": 4.92845678, "grad_norm": 0.50257427, "learning_rate": 4.174e-05, "elapsed_time_per_iteration": 6.66782594, "memory(GiB)": 21.51, "elapsed_time": "6h 48m 39s", "remaining_time": "4h 48m 21s", "loss_scale": 1.0, "consumed_samples": 954880, "global_step/max_steps": "3730/6362"} +{"lm loss": 4.92472029, "grad_norm": 0.45281509, "learning_rate": 4.171e-05, "elapsed_time_per_iteration": 6.63716269, "memory(GiB)": 21.51, "elapsed_time": "6h 48m 46s", "remaining_time": "4h 48m 15s", "loss_scale": 1.0, "consumed_samples": 955136, "global_step/max_steps": "3731/6362"} +{"lm loss": 4.93105841, "grad_norm": 0.44879088, "learning_rate": 4.169e-05, "elapsed_time_per_iteration": 6.69772243, "memory(GiB)": 21.51, "elapsed_time": "6h 48m 53s", "remaining_time": "4h 48m 8s", "loss_scale": 1.0, "consumed_samples": 955392, "global_step/max_steps": "3732/6362"} +{"lm loss": 4.88754988, "grad_norm": 0.40732196, "learning_rate": 4.166e-05, "elapsed_time_per_iteration": 6.52167654, "memory(GiB)": 21.51, "elapsed_time": "6h 48m 59s", "remaining_time": "4h 48m 2s", "loss_scale": 1.0, "consumed_samples": 955648, "global_step/max_steps": "3733/6362"} +{"lm loss": 4.91021633, "grad_norm": 0.4160026, "learning_rate": 4.164e-05, "elapsed_time_per_iteration": 6.74637365, "memory(GiB)": 21.51, "elapsed_time": "6h 49m 6s", "remaining_time": "4h 47m 55s", "loss_scale": 1.0, "consumed_samples": 955904, "global_step/max_steps": "3734/6362"} +{"lm loss": 4.89141035, "grad_norm": 0.46919131, "learning_rate": 4.161e-05, "elapsed_time_per_iteration": 6.79925108, "memory(GiB)": 21.51, "elapsed_time": "6h 49m 13s", "remaining_time": "4h 47m 49s", "loss_scale": 1.0, "consumed_samples": 956160, "global_step/max_steps": "3735/6362"} +{"lm loss": 4.92592812, "grad_norm": 0.45776975, "learning_rate": 4.159e-05, "elapsed_time_per_iteration": 6.47428441, "memory(GiB)": 21.51, "elapsed_time": "6h 49m 19s", "remaining_time": "4h 47m 42s", "loss_scale": 1.0, "consumed_samples": 956416, "global_step/max_steps": "3736/6362"} +{"lm loss": 4.92909288, "grad_norm": 0.47200167, "learning_rate": 4.156e-05, "elapsed_time_per_iteration": 6.54776812, "memory(GiB)": 21.51, "elapsed_time": "6h 49m 26s", "remaining_time": "4h 47m 36s", "loss_scale": 1.0, "consumed_samples": 956672, "global_step/max_steps": "3737/6362"} +{"lm loss": 4.92938709, "grad_norm": 0.38828161, "learning_rate": 4.154e-05, "elapsed_time_per_iteration": 6.43208265, "memory(GiB)": 21.51, "elapsed_time": "6h 49m 32s", "remaining_time": "4h 47m 29s", "loss_scale": 1.0, "consumed_samples": 956928, "global_step/max_steps": "3738/6362"} +{"lm loss": 4.94007349, "grad_norm": 0.44614622, "learning_rate": 4.151e-05, "elapsed_time_per_iteration": 6.83686328, "memory(GiB)": 21.51, "elapsed_time": "6h 49m 39s", "remaining_time": "4h 47m 23s", "loss_scale": 1.0, "consumed_samples": 957184, "global_step/max_steps": "3739/6362"} +{"lm loss": 4.92969847, "grad_norm": 0.49002174, "learning_rate": 4.149e-05, "elapsed_time_per_iteration": 6.44785023, "memory(GiB)": 21.51, "elapsed_time": "6h 49m 45s", "remaining_time": "4h 47m 16s", "loss_scale": 1.0, "consumed_samples": 957440, "global_step/max_steps": "3740/6362"} +{"lm loss": 4.88611126, "grad_norm": 0.48548117, "learning_rate": 4.147e-05, "elapsed_time_per_iteration": 6.57274032, "memory(GiB)": 21.51, "elapsed_time": "6h 49m 52s", "remaining_time": "4h 47m 9s", "loss_scale": 1.0, "consumed_samples": 957696, "global_step/max_steps": "3741/6362"} +{"lm loss": 4.92645788, "grad_norm": 0.43890971, "learning_rate": 4.144e-05, "elapsed_time_per_iteration": 6.46474338, "memory(GiB)": 21.51, "elapsed_time": "6h 49m 58s", "remaining_time": "4h 47m 3s", "loss_scale": 1.0, "consumed_samples": 957952, "global_step/max_steps": "3742/6362"} +{"lm loss": 4.89770412, "grad_norm": 0.42523009, "learning_rate": 4.142e-05, "elapsed_time_per_iteration": 6.82837224, "memory(GiB)": 21.51, "elapsed_time": "6h 50m 5s", "remaining_time": "4h 46m 56s", "loss_scale": 1.0, "consumed_samples": 958208, "global_step/max_steps": "3743/6362"} +{"lm loss": 4.91089439, "grad_norm": 0.46788219, "learning_rate": 4.139e-05, "elapsed_time_per_iteration": 6.67091894, "memory(GiB)": 21.51, "elapsed_time": "6h 50m 12s", "remaining_time": "4h 46m 50s", "loss_scale": 1.0, "consumed_samples": 958464, "global_step/max_steps": "3744/6362"} +{"lm loss": 4.92446041, "grad_norm": 0.48356017, "learning_rate": 4.137e-05, "elapsed_time_per_iteration": 6.88288879, "memory(GiB)": 21.51, "elapsed_time": "6h 50m 19s", "remaining_time": "4h 46m 43s", "loss_scale": 1.0, "consumed_samples": 958720, "global_step/max_steps": "3745/6362"} +{"lm loss": 4.89797544, "grad_norm": 0.43409353, "learning_rate": 4.134e-05, "elapsed_time_per_iteration": 6.63762474, "memory(GiB)": 21.51, "elapsed_time": "6h 50m 25s", "remaining_time": "4h 46m 37s", "loss_scale": 1.0, "consumed_samples": 958976, "global_step/max_steps": "3746/6362"} +{"lm loss": 4.93259001, "grad_norm": 0.44011462, "learning_rate": 4.132e-05, "elapsed_time_per_iteration": 6.68617988, "memory(GiB)": 21.51, "elapsed_time": "6h 50m 32s", "remaining_time": "4h 46m 30s", "loss_scale": 1.0, "consumed_samples": 959232, "global_step/max_steps": "3747/6362"} +{"lm loss": 4.90123177, "grad_norm": 0.45743081, "learning_rate": 4.129e-05, "elapsed_time_per_iteration": 6.83039212, "memory(GiB)": 21.51, "elapsed_time": "6h 50m 39s", "remaining_time": "4h 46m 24s", "loss_scale": 1.0, "consumed_samples": 959488, "global_step/max_steps": "3748/6362"} +{"lm loss": 4.88987017, "grad_norm": 0.51085234, "learning_rate": 4.127e-05, "elapsed_time_per_iteration": 6.63625383, "memory(GiB)": 21.51, "elapsed_time": "6h 50m 46s", "remaining_time": "4h 46m 17s", "loss_scale": 1.0, "consumed_samples": 959744, "global_step/max_steps": "3749/6362"} +{"lm loss": 4.91258049, "grad_norm": 0.44926092, "learning_rate": 4.124e-05, "elapsed_time_per_iteration": 6.58134079, "memory(GiB)": 21.51, "elapsed_time": "6h 50m 52s", "remaining_time": "4h 46m 11s", "loss_scale": 1.0, "consumed_samples": 960000, "global_step/max_steps": "3750/6362"} +{"lm loss": 4.90682507, "grad_norm": 0.47987688, "learning_rate": 4.122e-05, "elapsed_time_per_iteration": 6.76182508, "memory(GiB)": 21.51, "elapsed_time": "6h 50m 59s", "remaining_time": "4h 46m 4s", "loss_scale": 1.0, "consumed_samples": 960256, "global_step/max_steps": "3751/6362"} +{"lm loss": 4.9376049, "grad_norm": 0.49203894, "learning_rate": 4.119e-05, "elapsed_time_per_iteration": 6.64988613, "memory(GiB)": 21.51, "elapsed_time": "6h 51m 6s", "remaining_time": "4h 45m 58s", "loss_scale": 1.0, "consumed_samples": 960512, "global_step/max_steps": "3752/6362"} +{"lm loss": 4.9037652, "grad_norm": 0.49499851, "learning_rate": 4.117e-05, "elapsed_time_per_iteration": 6.44564605, "memory(GiB)": 21.51, "elapsed_time": "6h 51m 12s", "remaining_time": "4h 45m 51s", "loss_scale": 1.0, "consumed_samples": 960768, "global_step/max_steps": "3753/6362"} +{"lm loss": 4.91117716, "grad_norm": 0.48978639, "learning_rate": 4.114e-05, "elapsed_time_per_iteration": 6.51617622, "memory(GiB)": 21.51, "elapsed_time": "6h 51m 19s", "remaining_time": "4h 45m 45s", "loss_scale": 1.0, "consumed_samples": 961024, "global_step/max_steps": "3754/6362"} +{"lm loss": 4.93524885, "grad_norm": 0.40952069, "learning_rate": 4.112e-05, "elapsed_time_per_iteration": 6.66263723, "memory(GiB)": 21.51, "elapsed_time": "6h 51m 25s", "remaining_time": "4h 45m 38s", "loss_scale": 1.0, "consumed_samples": 961280, "global_step/max_steps": "3755/6362"} +{"lm loss": 4.91187, "grad_norm": 0.48187956, "learning_rate": 4.11e-05, "elapsed_time_per_iteration": 6.44568825, "memory(GiB)": 21.51, "elapsed_time": "6h 51m 32s", "remaining_time": "4h 45m 31s", "loss_scale": 1.0, "consumed_samples": 961536, "global_step/max_steps": "3756/6362"} +{"lm loss": 4.91895771, "grad_norm": 0.48753318, "learning_rate": 4.107e-05, "elapsed_time_per_iteration": 6.66907787, "memory(GiB)": 21.51, "elapsed_time": "6h 51m 38s", "remaining_time": "4h 45m 25s", "loss_scale": 1.0, "consumed_samples": 961792, "global_step/max_steps": "3757/6362"} +{"lm loss": 4.91873026, "grad_norm": 0.50862324, "learning_rate": 4.105e-05, "elapsed_time_per_iteration": 6.52685547, "memory(GiB)": 21.51, "elapsed_time": "6h 51m 45s", "remaining_time": "4h 45m 18s", "loss_scale": 1.0, "consumed_samples": 962048, "global_step/max_steps": "3758/6362"} +{"lm loss": 4.90470314, "grad_norm": 0.45483974, "learning_rate": 4.102e-05, "elapsed_time_per_iteration": 6.70917988, "memory(GiB)": 21.51, "elapsed_time": "6h 51m 52s", "remaining_time": "4h 45m 12s", "loss_scale": 1.0, "consumed_samples": 962304, "global_step/max_steps": "3759/6362"} +{"lm loss": 4.87419891, "grad_norm": 0.48412377, "learning_rate": 4.1e-05, "elapsed_time_per_iteration": 6.4530549, "memory(GiB)": 21.51, "elapsed_time": "6h 51m 58s", "remaining_time": "4h 45m 5s", "loss_scale": 1.0, "consumed_samples": 962560, "global_step/max_steps": "3760/6362"} +{"lm loss": 4.91454887, "grad_norm": 0.49990872, "learning_rate": 4.097e-05, "elapsed_time_per_iteration": 6.41742682, "memory(GiB)": 21.51, "elapsed_time": "6h 52m 4s", "remaining_time": "4h 44m 59s", "loss_scale": 1.0, "consumed_samples": 962816, "global_step/max_steps": "3761/6362"} +{"lm loss": 4.90675545, "grad_norm": 0.50862408, "learning_rate": 4.095e-05, "elapsed_time_per_iteration": 6.5469768, "memory(GiB)": 21.51, "elapsed_time": "6h 52m 11s", "remaining_time": "4h 44m 52s", "loss_scale": 1.0, "consumed_samples": 963072, "global_step/max_steps": "3762/6362"} +{"lm loss": 4.92718267, "grad_norm": 0.42900169, "learning_rate": 4.092e-05, "elapsed_time_per_iteration": 6.55353355, "memory(GiB)": 21.51, "elapsed_time": "6h 52m 18s", "remaining_time": "4h 44m 45s", "loss_scale": 1.0, "consumed_samples": 963328, "global_step/max_steps": "3763/6362"} +{"lm loss": 4.92763519, "grad_norm": 0.47433829, "learning_rate": 4.09e-05, "elapsed_time_per_iteration": 6.39525151, "memory(GiB)": 21.51, "elapsed_time": "6h 52m 24s", "remaining_time": "4h 44m 39s", "loss_scale": 1.0, "consumed_samples": 963584, "global_step/max_steps": "3764/6362"} +{"lm loss": 4.91117096, "grad_norm": 0.44631827, "learning_rate": 4.087e-05, "elapsed_time_per_iteration": 6.40525126, "memory(GiB)": 21.51, "elapsed_time": "6h 52m 30s", "remaining_time": "4h 44m 32s", "loss_scale": 1.0, "consumed_samples": 963840, "global_step/max_steps": "3765/6362"} +{"lm loss": 4.91742706, "grad_norm": 0.48874202, "learning_rate": 4.085e-05, "elapsed_time_per_iteration": 6.52485037, "memory(GiB)": 21.51, "elapsed_time": "6h 52m 37s", "remaining_time": "4h 44m 25s", "loss_scale": 1.0, "consumed_samples": 964096, "global_step/max_steps": "3766/6362"} +{"lm loss": 4.91758585, "grad_norm": 0.4431195, "learning_rate": 4.082e-05, "elapsed_time_per_iteration": 6.46842384, "memory(GiB)": 21.51, "elapsed_time": "6h 52m 43s", "remaining_time": "4h 44m 19s", "loss_scale": 1.0, "consumed_samples": 964352, "global_step/max_steps": "3767/6362"} +{"lm loss": 4.90825748, "grad_norm": 0.4336679, "learning_rate": 4.08e-05, "elapsed_time_per_iteration": 6.36035705, "memory(GiB)": 21.51, "elapsed_time": "6h 52m 50s", "remaining_time": "4h 44m 12s", "loss_scale": 1.0, "consumed_samples": 964608, "global_step/max_steps": "3768/6362"} +{"lm loss": 4.91411924, "grad_norm": 0.47004536, "learning_rate": 4.078e-05, "elapsed_time_per_iteration": 6.48580694, "memory(GiB)": 21.51, "elapsed_time": "6h 52m 56s", "remaining_time": "4h 44m 5s", "loss_scale": 1.0, "consumed_samples": 964864, "global_step/max_steps": "3769/6362"} +{"lm loss": 4.92394972, "grad_norm": 0.42118812, "learning_rate": 4.075e-05, "elapsed_time_per_iteration": 6.34603906, "memory(GiB)": 21.51, "elapsed_time": "6h 53m 3s", "remaining_time": "4h 43m 59s", "loss_scale": 1.0, "consumed_samples": 965120, "global_step/max_steps": "3770/6362"} +{"lm loss": 4.90854836, "grad_norm": 0.42877406, "learning_rate": 4.073e-05, "elapsed_time_per_iteration": 6.47862887, "memory(GiB)": 21.51, "elapsed_time": "6h 53m 9s", "remaining_time": "4h 43m 52s", "loss_scale": 1.0, "consumed_samples": 965376, "global_step/max_steps": "3771/6362"} +{"lm loss": 4.90205193, "grad_norm": 0.43924585, "learning_rate": 4.07e-05, "elapsed_time_per_iteration": 6.55636024, "memory(GiB)": 21.51, "elapsed_time": "6h 53m 16s", "remaining_time": "4h 43m 45s", "loss_scale": 1.0, "consumed_samples": 965632, "global_step/max_steps": "3772/6362"} +{"lm loss": 4.90798283, "grad_norm": 0.43331143, "learning_rate": 4.068e-05, "elapsed_time_per_iteration": 6.25602722, "memory(GiB)": 21.51, "elapsed_time": "6h 53m 22s", "remaining_time": "4h 43m 39s", "loss_scale": 1.0, "consumed_samples": 965888, "global_step/max_steps": "3773/6362"} +{"lm loss": 4.90737963, "grad_norm": 0.41815135, "learning_rate": 4.065e-05, "elapsed_time_per_iteration": 6.46736884, "memory(GiB)": 21.51, "elapsed_time": "6h 53m 28s", "remaining_time": "4h 43m 32s", "loss_scale": 1.0, "consumed_samples": 966144, "global_step/max_steps": "3774/6362"} +{"lm loss": 4.89451408, "grad_norm": 0.46129167, "learning_rate": 4.063e-05, "elapsed_time_per_iteration": 6.39551687, "memory(GiB)": 21.51, "elapsed_time": "6h 53m 35s", "remaining_time": "4h 43m 25s", "loss_scale": 1.0, "consumed_samples": 966400, "global_step/max_steps": "3775/6362"} +{"lm loss": 4.90736485, "grad_norm": 0.42991439, "learning_rate": 4.06e-05, "elapsed_time_per_iteration": 6.60459161, "memory(GiB)": 21.51, "elapsed_time": "6h 53m 41s", "remaining_time": "4h 43m 19s", "loss_scale": 1.0, "consumed_samples": 966656, "global_step/max_steps": "3776/6362"} +{"lm loss": 4.89759731, "grad_norm": 0.41463098, "learning_rate": 4.058e-05, "elapsed_time_per_iteration": 6.8699038, "memory(GiB)": 21.51, "elapsed_time": "6h 53m 48s", "remaining_time": "4h 43m 12s", "loss_scale": 1.0, "consumed_samples": 966912, "global_step/max_steps": "3777/6362"} +{"lm loss": 4.92749023, "grad_norm": 0.47222385, "learning_rate": 4.055e-05, "elapsed_time_per_iteration": 6.46651602, "memory(GiB)": 21.51, "elapsed_time": "6h 53m 55s", "remaining_time": "4h 43m 6s", "loss_scale": 1.0, "consumed_samples": 967168, "global_step/max_steps": "3778/6362"} +{"lm loss": 4.89224577, "grad_norm": 0.47452572, "learning_rate": 4.053e-05, "elapsed_time_per_iteration": 6.6903801, "memory(GiB)": 21.51, "elapsed_time": "6h 54m 1s", "remaining_time": "4h 42m 59s", "loss_scale": 1.0, "consumed_samples": 967424, "global_step/max_steps": "3779/6362"} +{"lm loss": 4.91425371, "grad_norm": 0.46087483, "learning_rate": 4.051e-05, "elapsed_time_per_iteration": 6.82929206, "memory(GiB)": 21.51, "elapsed_time": "6h 54m 8s", "remaining_time": "4h 42m 53s", "loss_scale": 1.0, "consumed_samples": 967680, "global_step/max_steps": "3780/6362"} +{"lm loss": 4.91075754, "grad_norm": 0.43960142, "learning_rate": 4.048e-05, "elapsed_time_per_iteration": 6.5774188, "memory(GiB)": 21.51, "elapsed_time": "6h 54m 15s", "remaining_time": "4h 42m 46s", "loss_scale": 1.0, "consumed_samples": 967936, "global_step/max_steps": "3781/6362"} +{"lm loss": 4.92474651, "grad_norm": 0.45455784, "learning_rate": 4.046e-05, "elapsed_time_per_iteration": 6.442029, "memory(GiB)": 21.51, "elapsed_time": "6h 54m 21s", "remaining_time": "4h 42m 40s", "loss_scale": 1.0, "consumed_samples": 968192, "global_step/max_steps": "3782/6362"} +{"lm loss": 4.92734718, "grad_norm": 0.42767787, "learning_rate": 4.043e-05, "elapsed_time_per_iteration": 6.57797527, "memory(GiB)": 21.51, "elapsed_time": "6h 54m 28s", "remaining_time": "4h 42m 33s", "loss_scale": 1.0, "consumed_samples": 968448, "global_step/max_steps": "3783/6362"} +{"lm loss": 4.9058938, "grad_norm": 0.46094447, "learning_rate": 4.041e-05, "elapsed_time_per_iteration": 6.77720976, "memory(GiB)": 21.51, "elapsed_time": "6h 54m 35s", "remaining_time": "4h 42m 27s", "loss_scale": 1.0, "consumed_samples": 968704, "global_step/max_steps": "3784/6362"} +{"lm loss": 4.88554001, "grad_norm": 0.49579936, "learning_rate": 4.038e-05, "elapsed_time_per_iteration": 6.51532531, "memory(GiB)": 21.51, "elapsed_time": "6h 54m 41s", "remaining_time": "4h 42m 20s", "loss_scale": 1.0, "consumed_samples": 968960, "global_step/max_steps": "3785/6362"} +{"lm loss": 4.89844275, "grad_norm": 0.48811495, "learning_rate": 4.036e-05, "elapsed_time_per_iteration": 6.79467893, "memory(GiB)": 21.51, "elapsed_time": "6h 54m 48s", "remaining_time": "4h 42m 14s", "loss_scale": 1.0, "consumed_samples": 969216, "global_step/max_steps": "3786/6362"} +{"lm loss": 4.91991043, "grad_norm": 0.49633503, "learning_rate": 4.033e-05, "elapsed_time_per_iteration": 6.50031304, "memory(GiB)": 21.51, "elapsed_time": "6h 54m 54s", "remaining_time": "4h 42m 7s", "loss_scale": 1.0, "consumed_samples": 969472, "global_step/max_steps": "3787/6362"} +{"lm loss": 4.91213799, "grad_norm": 0.44944876, "learning_rate": 4.031e-05, "elapsed_time_per_iteration": 6.7833674, "memory(GiB)": 21.51, "elapsed_time": "6h 55m 1s", "remaining_time": "4h 42m 0s", "loss_scale": 1.0, "consumed_samples": 969728, "global_step/max_steps": "3788/6362"} +{"lm loss": 4.89219809, "grad_norm": 0.47450763, "learning_rate": 4.028e-05, "elapsed_time_per_iteration": 6.72898483, "memory(GiB)": 21.51, "elapsed_time": "6h 55m 8s", "remaining_time": "4h 41m 54s", "loss_scale": 1.0, "consumed_samples": 969984, "global_step/max_steps": "3789/6362"} +{"lm loss": 4.89209747, "grad_norm": 0.4421424, "learning_rate": 4.026e-05, "elapsed_time_per_iteration": 6.60580397, "memory(GiB)": 21.51, "elapsed_time": "6h 55m 14s", "remaining_time": "4h 41m 47s", "loss_scale": 1.0, "consumed_samples": 970240, "global_step/max_steps": "3790/6362"} +{"lm loss": 4.9177618, "grad_norm": 0.47262785, "learning_rate": 4.024e-05, "elapsed_time_per_iteration": 6.73492432, "memory(GiB)": 21.51, "elapsed_time": "6h 55m 21s", "remaining_time": "4h 41m 41s", "loss_scale": 1.0, "consumed_samples": 970496, "global_step/max_steps": "3791/6362"} +{"lm loss": 4.90970516, "grad_norm": 0.49642217, "learning_rate": 4.021e-05, "elapsed_time_per_iteration": 6.63324285, "memory(GiB)": 21.51, "elapsed_time": "6h 55m 28s", "remaining_time": "4h 41m 34s", "loss_scale": 1.0, "consumed_samples": 970752, "global_step/max_steps": "3792/6362"} +{"lm loss": 4.9098525, "grad_norm": 0.46576864, "learning_rate": 4.019e-05, "elapsed_time_per_iteration": 6.47346926, "memory(GiB)": 21.51, "elapsed_time": "6h 55m 34s", "remaining_time": "4h 41m 28s", "loss_scale": 1.0, "consumed_samples": 971008, "global_step/max_steps": "3793/6362"} +{"lm loss": 4.90550089, "grad_norm": 0.46549988, "learning_rate": 4.016e-05, "elapsed_time_per_iteration": 6.60177875, "memory(GiB)": 21.51, "elapsed_time": "6h 55m 41s", "remaining_time": "4h 41m 21s", "loss_scale": 1.0, "consumed_samples": 971264, "global_step/max_steps": "3794/6362"} +{"lm loss": 4.9065609, "grad_norm": 0.42522216, "learning_rate": 4.014e-05, "elapsed_time_per_iteration": 6.4316287, "memory(GiB)": 21.51, "elapsed_time": "6h 55m 47s", "remaining_time": "4h 41m 15s", "loss_scale": 1.0, "consumed_samples": 971520, "global_step/max_steps": "3795/6362"} +{"lm loss": 4.90553093, "grad_norm": 0.44368157, "learning_rate": 4.011e-05, "elapsed_time_per_iteration": 6.48030853, "memory(GiB)": 21.51, "elapsed_time": "6h 55m 54s", "remaining_time": "4h 41m 8s", "loss_scale": 1.0, "consumed_samples": 971776, "global_step/max_steps": "3796/6362"} +{"lm loss": 4.88986301, "grad_norm": 0.45181364, "learning_rate": 4.009e-05, "elapsed_time_per_iteration": 6.44464231, "memory(GiB)": 21.51, "elapsed_time": "6h 56m 0s", "remaining_time": "4h 41m 1s", "loss_scale": 1.0, "consumed_samples": 972032, "global_step/max_steps": "3797/6362"} +{"lm loss": 4.88629818, "grad_norm": 0.46011019, "learning_rate": 4.006e-05, "elapsed_time_per_iteration": 6.5374999, "memory(GiB)": 21.51, "elapsed_time": "6h 56m 7s", "remaining_time": "4h 40m 55s", "loss_scale": 1.0, "consumed_samples": 972288, "global_step/max_steps": "3798/6362"} +{"lm loss": 4.89769506, "grad_norm": 0.42876506, "learning_rate": 4.004e-05, "elapsed_time_per_iteration": 6.6583879, "memory(GiB)": 21.51, "elapsed_time": "6h 56m 13s", "remaining_time": "4h 40m 48s", "loss_scale": 1.0, "consumed_samples": 972544, "global_step/max_steps": "3799/6362"} +{"lm loss": 4.92679501, "grad_norm": 0.39929828, "learning_rate": 4.001e-05, "elapsed_time_per_iteration": 6.66475844, "memory(GiB)": 21.51, "elapsed_time": "6h 56m 20s", "remaining_time": "4h 40m 42s", "loss_scale": 1.0, "consumed_samples": 972800, "global_step/max_steps": "3800/6362"} +{"lm loss": 4.91585159, "grad_norm": 0.46785426, "learning_rate": 3.999e-05, "elapsed_time_per_iteration": 6.71362567, "memory(GiB)": 21.51, "elapsed_time": "6h 56m 27s", "remaining_time": "4h 40m 35s", "loss_scale": 1.0, "consumed_samples": 973056, "global_step/max_steps": "3801/6362"} +{"lm loss": 4.91270304, "grad_norm": 0.49068022, "learning_rate": 3.997e-05, "elapsed_time_per_iteration": 6.24591637, "memory(GiB)": 21.51, "elapsed_time": "6h 56m 33s", "remaining_time": "4h 40m 28s", "loss_scale": 1.0, "consumed_samples": 973312, "global_step/max_steps": "3802/6362"} +{"lm loss": 4.90007353, "grad_norm": 0.4777523, "learning_rate": 3.994e-05, "elapsed_time_per_iteration": 6.57077909, "memory(GiB)": 21.51, "elapsed_time": "6h 56m 40s", "remaining_time": "4h 40m 22s", "loss_scale": 1.0, "consumed_samples": 973568, "global_step/max_steps": "3803/6362"} +{"lm loss": 4.90676403, "grad_norm": 0.45222071, "learning_rate": 3.992e-05, "elapsed_time_per_iteration": 6.47031307, "memory(GiB)": 21.51, "elapsed_time": "6h 56m 46s", "remaining_time": "4h 40m 15s", "loss_scale": 1.0, "consumed_samples": 973824, "global_step/max_steps": "3804/6362"} +{"lm loss": 4.91010427, "grad_norm": 0.41229686, "learning_rate": 3.989e-05, "elapsed_time_per_iteration": 6.69858479, "memory(GiB)": 21.51, "elapsed_time": "6h 56m 53s", "remaining_time": "4h 40m 9s", "loss_scale": 1.0, "consumed_samples": 974080, "global_step/max_steps": "3805/6362"} +{"lm loss": 4.91200495, "grad_norm": 0.46454301, "learning_rate": 3.987e-05, "elapsed_time_per_iteration": 6.54092145, "memory(GiB)": 21.51, "elapsed_time": "6h 56m 59s", "remaining_time": "4h 40m 2s", "loss_scale": 1.0, "consumed_samples": 974336, "global_step/max_steps": "3806/6362"} +{"lm loss": 4.89398479, "grad_norm": 0.44990221, "learning_rate": 3.984e-05, "elapsed_time_per_iteration": 6.5006876, "memory(GiB)": 21.51, "elapsed_time": "6h 57m 6s", "remaining_time": "4h 39m 55s", "loss_scale": 1.0, "consumed_samples": 974592, "global_step/max_steps": "3807/6362"} +{"lm loss": 4.88616085, "grad_norm": 0.47834817, "learning_rate": 3.982e-05, "elapsed_time_per_iteration": 6.39966774, "memory(GiB)": 21.51, "elapsed_time": "6h 57m 12s", "remaining_time": "4h 39m 49s", "loss_scale": 1.0, "consumed_samples": 974848, "global_step/max_steps": "3808/6362"} +{"lm loss": 4.90928745, "grad_norm": 0.45158747, "learning_rate": 3.979e-05, "elapsed_time_per_iteration": 6.32846165, "memory(GiB)": 21.51, "elapsed_time": "6h 57m 19s", "remaining_time": "4h 39m 42s", "loss_scale": 1.0, "consumed_samples": 975104, "global_step/max_steps": "3809/6362"} +{"lm loss": 4.91347027, "grad_norm": 0.45140702, "learning_rate": 3.977e-05, "elapsed_time_per_iteration": 6.5402, "memory(GiB)": 21.51, "elapsed_time": "6h 57m 25s", "remaining_time": "4h 39m 35s", "loss_scale": 1.0, "consumed_samples": 975360, "global_step/max_steps": "3810/6362"} +{"lm loss": 4.91535282, "grad_norm": 0.46015108, "learning_rate": 3.975e-05, "elapsed_time_per_iteration": 6.31932473, "memory(GiB)": 21.51, "elapsed_time": "6h 57m 31s", "remaining_time": "4h 39m 29s", "loss_scale": 1.0, "consumed_samples": 975616, "global_step/max_steps": "3811/6362"} +{"lm loss": 4.87994099, "grad_norm": 0.4375976, "learning_rate": 3.972e-05, "elapsed_time_per_iteration": 6.74730253, "memory(GiB)": 21.51, "elapsed_time": "6h 57m 38s", "remaining_time": "4h 39m 22s", "loss_scale": 1.0, "consumed_samples": 975872, "global_step/max_steps": "3812/6362"} +{"lm loss": 4.92141533, "grad_norm": 0.41291574, "learning_rate": 3.97e-05, "elapsed_time_per_iteration": 6.61909676, "memory(GiB)": 21.51, "elapsed_time": "6h 57m 45s", "remaining_time": "4h 39m 16s", "loss_scale": 1.0, "consumed_samples": 976128, "global_step/max_steps": "3813/6362"} +{"lm loss": 4.90121126, "grad_norm": 0.44239968, "learning_rate": 3.967e-05, "elapsed_time_per_iteration": 6.57740498, "memory(GiB)": 21.51, "elapsed_time": "6h 57m 51s", "remaining_time": "4h 39m 9s", "loss_scale": 1.0, "consumed_samples": 976384, "global_step/max_steps": "3814/6362"} +{"lm loss": 4.87937021, "grad_norm": 0.42423007, "learning_rate": 3.965e-05, "elapsed_time_per_iteration": 6.37801909, "memory(GiB)": 21.51, "elapsed_time": "6h 57m 58s", "remaining_time": "4h 39m 2s", "loss_scale": 1.0, "consumed_samples": 976640, "global_step/max_steps": "3815/6362"} +{"lm loss": 4.92921114, "grad_norm": 0.47499099, "learning_rate": 3.962e-05, "elapsed_time_per_iteration": 6.53265119, "memory(GiB)": 21.51, "elapsed_time": "6h 58m 4s", "remaining_time": "4h 38m 56s", "loss_scale": 1.0, "consumed_samples": 976896, "global_step/max_steps": "3816/6362"} +{"lm loss": 4.90799236, "grad_norm": 0.4333775, "learning_rate": 3.96e-05, "elapsed_time_per_iteration": 6.57056952, "memory(GiB)": 21.51, "elapsed_time": "6h 58m 11s", "remaining_time": "4h 38m 49s", "loss_scale": 1.0, "consumed_samples": 977152, "global_step/max_steps": "3817/6362"} +{"lm loss": 4.89841843, "grad_norm": 0.44425446, "learning_rate": 3.957e-05, "elapsed_time_per_iteration": 6.51158547, "memory(GiB)": 21.51, "elapsed_time": "6h 58m 17s", "remaining_time": "4h 38m 43s", "loss_scale": 1.0, "consumed_samples": 977408, "global_step/max_steps": "3818/6362"} +{"lm loss": 4.92628765, "grad_norm": 0.4019419, "learning_rate": 3.955e-05, "elapsed_time_per_iteration": 6.74822307, "memory(GiB)": 21.51, "elapsed_time": "6h 58m 24s", "remaining_time": "4h 38m 36s", "loss_scale": 1.0, "consumed_samples": 977664, "global_step/max_steps": "3819/6362"} +{"lm loss": 4.91428518, "grad_norm": 0.44312409, "learning_rate": 3.953e-05, "elapsed_time_per_iteration": 6.57027936, "memory(GiB)": 21.51, "elapsed_time": "6h 58m 31s", "remaining_time": "4h 38m 30s", "loss_scale": 1.0, "consumed_samples": 977920, "global_step/max_steps": "3820/6362"} +{"lm loss": 4.91686821, "grad_norm": 0.41758561, "learning_rate": 3.95e-05, "elapsed_time_per_iteration": 6.4463377, "memory(GiB)": 21.51, "elapsed_time": "6h 58m 37s", "remaining_time": "4h 38m 23s", "loss_scale": 1.0, "consumed_samples": 978176, "global_step/max_steps": "3821/6362"} +{"lm loss": 4.88432503, "grad_norm": 0.43194956, "learning_rate": 3.948e-05, "elapsed_time_per_iteration": 6.51781774, "memory(GiB)": 21.51, "elapsed_time": "6h 58m 44s", "remaining_time": "4h 38m 16s", "loss_scale": 1.0, "consumed_samples": 978432, "global_step/max_steps": "3822/6362"} +{"lm loss": 4.8930912, "grad_norm": 0.46997193, "learning_rate": 3.945e-05, "elapsed_time_per_iteration": 6.59431529, "memory(GiB)": 21.51, "elapsed_time": "6h 58m 50s", "remaining_time": "4h 38m 10s", "loss_scale": 1.0, "consumed_samples": 978688, "global_step/max_steps": "3823/6362"} +{"lm loss": 4.89652061, "grad_norm": 0.41474181, "learning_rate": 3.943e-05, "elapsed_time_per_iteration": 6.53787565, "memory(GiB)": 21.51, "elapsed_time": "6h 58m 57s", "remaining_time": "4h 38m 3s", "loss_scale": 1.0, "consumed_samples": 978944, "global_step/max_steps": "3824/6362"} +{"lm loss": 4.88083315, "grad_norm": 0.43690079, "learning_rate": 3.94e-05, "elapsed_time_per_iteration": 6.48490429, "memory(GiB)": 21.51, "elapsed_time": "6h 59m 3s", "remaining_time": "4h 37m 57s", "loss_scale": 1.0, "consumed_samples": 979200, "global_step/max_steps": "3825/6362"} +{"lm loss": 4.91142893, "grad_norm": 0.41244677, "learning_rate": 3.938e-05, "elapsed_time_per_iteration": 6.36519575, "memory(GiB)": 21.51, "elapsed_time": "6h 59m 10s", "remaining_time": "4h 37m 50s", "loss_scale": 1.0, "consumed_samples": 979456, "global_step/max_steps": "3826/6362"} +{"lm loss": 4.91399288, "grad_norm": 0.52549887, "learning_rate": 3.935e-05, "elapsed_time_per_iteration": 6.57065082, "memory(GiB)": 21.51, "elapsed_time": "6h 59m 16s", "remaining_time": "4h 37m 43s", "loss_scale": 1.0, "consumed_samples": 979712, "global_step/max_steps": "3827/6362"} +{"lm loss": 4.88505125, "grad_norm": 0.50504386, "learning_rate": 3.933e-05, "elapsed_time_per_iteration": 6.67981839, "memory(GiB)": 21.51, "elapsed_time": "6h 59m 23s", "remaining_time": "4h 37m 37s", "loss_scale": 1.0, "consumed_samples": 979968, "global_step/max_steps": "3828/6362"} +{"lm loss": 4.89039564, "grad_norm": 0.41529328, "learning_rate": 3.931e-05, "elapsed_time_per_iteration": 6.63656354, "memory(GiB)": 21.51, "elapsed_time": "6h 59m 30s", "remaining_time": "4h 37m 30s", "loss_scale": 1.0, "consumed_samples": 980224, "global_step/max_steps": "3829/6362"} +{"lm loss": 4.88748598, "grad_norm": 0.44217619, "learning_rate": 3.928e-05, "elapsed_time_per_iteration": 6.71278667, "memory(GiB)": 21.51, "elapsed_time": "6h 59m 36s", "remaining_time": "4h 37m 24s", "loss_scale": 1.0, "consumed_samples": 980480, "global_step/max_steps": "3830/6362"} +{"lm loss": 4.93456221, "grad_norm": 0.47185814, "learning_rate": 3.926e-05, "elapsed_time_per_iteration": 6.69403505, "memory(GiB)": 21.51, "elapsed_time": "6h 59m 43s", "remaining_time": "4h 37m 17s", "loss_scale": 1.0, "consumed_samples": 980736, "global_step/max_steps": "3831/6362"} +{"lm loss": 4.89528179, "grad_norm": 0.48111996, "learning_rate": 3.923e-05, "elapsed_time_per_iteration": 6.47991538, "memory(GiB)": 21.51, "elapsed_time": "6h 59m 49s", "remaining_time": "4h 37m 11s", "loss_scale": 1.0, "consumed_samples": 980992, "global_step/max_steps": "3832/6362"} +{"lm loss": 4.89255142, "grad_norm": 0.46266401, "learning_rate": 3.921e-05, "elapsed_time_per_iteration": 6.68164039, "memory(GiB)": 21.51, "elapsed_time": "6h 59m 56s", "remaining_time": "4h 37m 4s", "loss_scale": 1.0, "consumed_samples": 981248, "global_step/max_steps": "3833/6362"} +{"lm loss": 4.89942694, "grad_norm": 0.45357078, "learning_rate": 3.918e-05, "elapsed_time_per_iteration": 6.35522318, "memory(GiB)": 21.51, "elapsed_time": "7h 0m 2s", "remaining_time": "4h 36m 57s", "loss_scale": 1.0, "consumed_samples": 981504, "global_step/max_steps": "3834/6362"} +{"lm loss": 4.89553118, "grad_norm": 0.43496192, "learning_rate": 3.916e-05, "elapsed_time_per_iteration": 6.53349352, "memory(GiB)": 21.51, "elapsed_time": "7h 0m 9s", "remaining_time": "4h 36m 51s", "loss_scale": 1.0, "consumed_samples": 981760, "global_step/max_steps": "3835/6362"} +{"lm loss": 4.90791178, "grad_norm": 0.50002646, "learning_rate": 3.914e-05, "elapsed_time_per_iteration": 6.34683156, "memory(GiB)": 21.51, "elapsed_time": "7h 0m 15s", "remaining_time": "4h 36m 44s", "loss_scale": 1.0, "consumed_samples": 982016, "global_step/max_steps": "3836/6362"} +{"lm loss": 4.88249683, "grad_norm": 0.50046265, "learning_rate": 3.911e-05, "elapsed_time_per_iteration": 6.59521341, "memory(GiB)": 21.51, "elapsed_time": "7h 0m 22s", "remaining_time": "4h 36m 38s", "loss_scale": 1.0, "consumed_samples": 982272, "global_step/max_steps": "3837/6362"} +{"lm loss": 4.9055872, "grad_norm": 0.45091188, "learning_rate": 3.909e-05, "elapsed_time_per_iteration": 6.37717462, "memory(GiB)": 21.51, "elapsed_time": "7h 0m 28s", "remaining_time": "4h 36m 31s", "loss_scale": 1.0, "consumed_samples": 982528, "global_step/max_steps": "3838/6362"} +{"lm loss": 4.88623333, "grad_norm": 0.50254649, "learning_rate": 3.906e-05, "elapsed_time_per_iteration": 6.67934847, "memory(GiB)": 21.51, "elapsed_time": "7h 0m 35s", "remaining_time": "4h 36m 24s", "loss_scale": 1.0, "consumed_samples": 982784, "global_step/max_steps": "3839/6362"} +{"lm loss": 4.92088461, "grad_norm": 0.4684312, "learning_rate": 3.904e-05, "elapsed_time_per_iteration": 6.39446068, "memory(GiB)": 21.51, "elapsed_time": "7h 0m 41s", "remaining_time": "4h 36m 18s", "loss_scale": 1.0, "consumed_samples": 983040, "global_step/max_steps": "3840/6362"} +{"lm loss": 4.92659712, "grad_norm": 0.47352439, "learning_rate": 3.901e-05, "elapsed_time_per_iteration": 6.30054569, "memory(GiB)": 21.51, "elapsed_time": "7h 0m 48s", "remaining_time": "4h 36m 11s", "loss_scale": 1.0, "consumed_samples": 983296, "global_step/max_steps": "3841/6362"} +{"lm loss": 4.90494108, "grad_norm": 0.42961267, "learning_rate": 3.899e-05, "elapsed_time_per_iteration": 6.41377354, "memory(GiB)": 21.51, "elapsed_time": "7h 0m 54s", "remaining_time": "4h 36m 4s", "loss_scale": 1.0, "consumed_samples": 983552, "global_step/max_steps": "3842/6362"} +{"lm loss": 4.89476109, "grad_norm": 0.48206446, "learning_rate": 3.896e-05, "elapsed_time_per_iteration": 6.43408608, "memory(GiB)": 21.51, "elapsed_time": "7h 1m 1s", "remaining_time": "4h 35m 58s", "loss_scale": 1.0, "consumed_samples": 983808, "global_step/max_steps": "3843/6362"} +{"lm loss": 4.88282299, "grad_norm": 0.4998979, "learning_rate": 3.894e-05, "elapsed_time_per_iteration": 6.72048044, "memory(GiB)": 21.51, "elapsed_time": "7h 1m 7s", "remaining_time": "4h 35m 51s", "loss_scale": 1.0, "consumed_samples": 984064, "global_step/max_steps": "3844/6362"} +{"lm loss": 4.91638327, "grad_norm": 0.41519678, "learning_rate": 3.892e-05, "elapsed_time_per_iteration": 6.57224321, "memory(GiB)": 21.51, "elapsed_time": "7h 1m 14s", "remaining_time": "4h 35m 44s", "loss_scale": 1.0, "consumed_samples": 984320, "global_step/max_steps": "3845/6362"} +{"lm loss": 4.9138813, "grad_norm": 0.44616863, "learning_rate": 3.889e-05, "elapsed_time_per_iteration": 6.3548162, "memory(GiB)": 21.51, "elapsed_time": "7h 1m 20s", "remaining_time": "4h 35m 38s", "loss_scale": 1.0, "consumed_samples": 984576, "global_step/max_steps": "3846/6362"} +{"lm loss": 4.88542366, "grad_norm": 0.44106179, "learning_rate": 3.887e-05, "elapsed_time_per_iteration": 6.49444985, "memory(GiB)": 21.51, "elapsed_time": "7h 1m 27s", "remaining_time": "4h 35m 31s", "loss_scale": 1.0, "consumed_samples": 984832, "global_step/max_steps": "3847/6362"} +{"lm loss": 4.90730095, "grad_norm": 0.48276272, "learning_rate": 3.884e-05, "elapsed_time_per_iteration": 6.35423374, "memory(GiB)": 21.51, "elapsed_time": "7h 1m 33s", "remaining_time": "4h 35m 24s", "loss_scale": 1.0, "consumed_samples": 985088, "global_step/max_steps": "3848/6362"} +{"lm loss": 4.9019146, "grad_norm": 0.42033145, "learning_rate": 3.882e-05, "elapsed_time_per_iteration": 6.3143611, "memory(GiB)": 21.51, "elapsed_time": "7h 1m 39s", "remaining_time": "4h 35m 18s", "loss_scale": 1.0, "consumed_samples": 985344, "global_step/max_steps": "3849/6362"} +{"lm loss": 4.91183472, "grad_norm": 0.44887558, "learning_rate": 3.879e-05, "elapsed_time_per_iteration": 6.43456531, "memory(GiB)": 21.51, "elapsed_time": "7h 1m 46s", "remaining_time": "4h 35m 11s", "loss_scale": 1.0, "consumed_samples": 985600, "global_step/max_steps": "3850/6362"} +{"lm loss": 4.8842802, "grad_norm": 0.42820442, "learning_rate": 3.877e-05, "elapsed_time_per_iteration": 6.75395823, "memory(GiB)": 21.51, "elapsed_time": "7h 1m 53s", "remaining_time": "4h 35m 5s", "loss_scale": 1.0, "consumed_samples": 985856, "global_step/max_steps": "3851/6362"} +{"lm loss": 4.90813398, "grad_norm": 0.4236123, "learning_rate": 3.875e-05, "elapsed_time_per_iteration": 6.71538591, "memory(GiB)": 21.51, "elapsed_time": "7h 1m 59s", "remaining_time": "4h 34m 58s", "loss_scale": 1.0, "consumed_samples": 986112, "global_step/max_steps": "3852/6362"} +{"lm loss": 4.87829065, "grad_norm": 0.44518042, "learning_rate": 3.872e-05, "elapsed_time_per_iteration": 6.85339618, "memory(GiB)": 21.51, "elapsed_time": "7h 2m 6s", "remaining_time": "4h 34m 52s", "loss_scale": 1.0, "consumed_samples": 986368, "global_step/max_steps": "3853/6362"} +{"lm loss": 4.92753124, "grad_norm": 0.42019919, "learning_rate": 3.87e-05, "elapsed_time_per_iteration": 6.69105721, "memory(GiB)": 21.51, "elapsed_time": "7h 2m 13s", "remaining_time": "4h 34m 45s", "loss_scale": 1.0, "consumed_samples": 986624, "global_step/max_steps": "3854/6362"} +{"lm loss": 4.90736961, "grad_norm": 0.38467541, "learning_rate": 3.867e-05, "elapsed_time_per_iteration": 6.53645301, "memory(GiB)": 21.51, "elapsed_time": "7h 2m 19s", "remaining_time": "4h 34m 39s", "loss_scale": 1.0, "consumed_samples": 986880, "global_step/max_steps": "3855/6362"} +{"lm loss": 4.92949438, "grad_norm": 0.42072681, "learning_rate": 3.865e-05, "elapsed_time_per_iteration": 6.7165134, "memory(GiB)": 21.51, "elapsed_time": "7h 2m 26s", "remaining_time": "4h 34m 32s", "loss_scale": 1.0, "consumed_samples": 987136, "global_step/max_steps": "3856/6362"} +{"lm loss": 4.90498877, "grad_norm": 0.4240686, "learning_rate": 3.862e-05, "elapsed_time_per_iteration": 6.7319622, "memory(GiB)": 21.51, "elapsed_time": "7h 2m 33s", "remaining_time": "4h 34m 26s", "loss_scale": 1.0, "consumed_samples": 987392, "global_step/max_steps": "3857/6362"} +{"lm loss": 4.91895723, "grad_norm": 0.4515838, "learning_rate": 3.86e-05, "elapsed_time_per_iteration": 6.67650366, "memory(GiB)": 21.51, "elapsed_time": "7h 2m 39s", "remaining_time": "4h 34m 19s", "loss_scale": 1.0, "consumed_samples": 987648, "global_step/max_steps": "3858/6362"} +{"lm loss": 4.91667128, "grad_norm": 0.58614534, "learning_rate": 3.858e-05, "elapsed_time_per_iteration": 6.41056728, "memory(GiB)": 21.51, "elapsed_time": "7h 2m 46s", "remaining_time": "4h 34m 12s", "loss_scale": 1.0, "consumed_samples": 987904, "global_step/max_steps": "3859/6362"} +{"lm loss": 4.91418886, "grad_norm": 0.47371244, "learning_rate": 3.855e-05, "elapsed_time_per_iteration": 6.37692571, "memory(GiB)": 21.51, "elapsed_time": "7h 2m 52s", "remaining_time": "4h 34m 6s", "loss_scale": 1.0, "consumed_samples": 988160, "global_step/max_steps": "3860/6362"} +{"lm loss": 4.91651058, "grad_norm": 0.41894972, "learning_rate": 3.853e-05, "elapsed_time_per_iteration": 6.44093919, "memory(GiB)": 21.51, "elapsed_time": "7h 2m 59s", "remaining_time": "4h 33m 59s", "loss_scale": 1.0, "consumed_samples": 988416, "global_step/max_steps": "3861/6362"} +{"lm loss": 4.89729881, "grad_norm": 0.45822495, "learning_rate": 3.85e-05, "elapsed_time_per_iteration": 6.60869861, "memory(GiB)": 21.51, "elapsed_time": "7h 3m 5s", "remaining_time": "4h 33m 53s", "loss_scale": 1.0, "consumed_samples": 988672, "global_step/max_steps": "3862/6362"} +{"lm loss": 4.90569353, "grad_norm": 0.40391636, "learning_rate": 3.848e-05, "elapsed_time_per_iteration": 6.75514388, "memory(GiB)": 21.51, "elapsed_time": "7h 3m 12s", "remaining_time": "4h 33m 46s", "loss_scale": 1.0, "consumed_samples": 988928, "global_step/max_steps": "3863/6362"} +{"lm loss": 4.89651394, "grad_norm": 0.3873682, "learning_rate": 3.845e-05, "elapsed_time_per_iteration": 6.49036574, "memory(GiB)": 21.51, "elapsed_time": "7h 3m 19s", "remaining_time": "4h 33m 39s", "loss_scale": 1.0, "consumed_samples": 989184, "global_step/max_steps": "3864/6362"} +{"lm loss": 4.90988064, "grad_norm": 0.46486026, "learning_rate": 3.843e-05, "elapsed_time_per_iteration": 6.40270591, "memory(GiB)": 21.51, "elapsed_time": "7h 3m 25s", "remaining_time": "4h 33m 33s", "loss_scale": 1.0, "consumed_samples": 989440, "global_step/max_steps": "3865/6362"} +{"lm loss": 4.88220072, "grad_norm": 0.45801026, "learning_rate": 3.841e-05, "elapsed_time_per_iteration": 6.53935242, "memory(GiB)": 21.51, "elapsed_time": "7h 3m 31s", "remaining_time": "4h 33m 26s", "loss_scale": 1.0, "consumed_samples": 989696, "global_step/max_steps": "3866/6362"} +{"lm loss": 4.90278912, "grad_norm": 0.47336605, "learning_rate": 3.838e-05, "elapsed_time_per_iteration": 6.6086483, "memory(GiB)": 21.51, "elapsed_time": "7h 3m 38s", "remaining_time": "4h 33m 20s", "loss_scale": 1.0, "consumed_samples": 989952, "global_step/max_steps": "3867/6362"} +{"lm loss": 4.9280715, "grad_norm": 0.48260313, "learning_rate": 3.836e-05, "elapsed_time_per_iteration": 6.66190886, "memory(GiB)": 21.51, "elapsed_time": "7h 3m 45s", "remaining_time": "4h 33m 13s", "loss_scale": 1.0, "consumed_samples": 990208, "global_step/max_steps": "3868/6362"} +{"lm loss": 4.89396572, "grad_norm": 0.46095744, "learning_rate": 3.833e-05, "elapsed_time_per_iteration": 6.9433825, "memory(GiB)": 21.51, "elapsed_time": "7h 3m 52s", "remaining_time": "4h 33m 7s", "loss_scale": 1.0, "consumed_samples": 990464, "global_step/max_steps": "3869/6362"} +{"lm loss": 4.86391783, "grad_norm": 0.46628219, "learning_rate": 3.831e-05, "elapsed_time_per_iteration": 6.58625197, "memory(GiB)": 21.51, "elapsed_time": "7h 3m 58s", "remaining_time": "4h 33m 0s", "loss_scale": 1.0, "consumed_samples": 990720, "global_step/max_steps": "3870/6362"} +{"lm loss": 4.93924856, "grad_norm": 0.45596039, "learning_rate": 3.828e-05, "elapsed_time_per_iteration": 6.49158597, "memory(GiB)": 21.51, "elapsed_time": "7h 4m 5s", "remaining_time": "4h 32m 54s", "loss_scale": 1.0, "consumed_samples": 990976, "global_step/max_steps": "3871/6362"} +{"lm loss": 4.90396595, "grad_norm": 0.4592346, "learning_rate": 3.826e-05, "elapsed_time_per_iteration": 6.714293, "memory(GiB)": 21.51, "elapsed_time": "7h 4m 11s", "remaining_time": "4h 32m 47s", "loss_scale": 1.0, "consumed_samples": 991232, "global_step/max_steps": "3872/6362"} +{"lm loss": 4.90724945, "grad_norm": 0.45530063, "learning_rate": 3.824e-05, "elapsed_time_per_iteration": 6.6435132, "memory(GiB)": 21.51, "elapsed_time": "7h 4m 18s", "remaining_time": "4h 32m 41s", "loss_scale": 1.0, "consumed_samples": 991488, "global_step/max_steps": "3873/6362"} +{"lm loss": 4.90351248, "grad_norm": 0.4786188, "learning_rate": 3.821e-05, "elapsed_time_per_iteration": 6.54883456, "memory(GiB)": 21.51, "elapsed_time": "7h 4m 25s", "remaining_time": "4h 32m 34s", "loss_scale": 1.0, "consumed_samples": 991744, "global_step/max_steps": "3874/6362"} +{"lm loss": 4.9197998, "grad_norm": 0.50821579, "learning_rate": 3.819e-05, "elapsed_time_per_iteration": 6.63113284, "memory(GiB)": 21.51, "elapsed_time": "7h 4m 31s", "remaining_time": "4h 32m 27s", "loss_scale": 1.0, "consumed_samples": 992000, "global_step/max_steps": "3875/6362"} +{"lm loss": 4.88338137, "grad_norm": 0.42695579, "learning_rate": 3.816e-05, "elapsed_time_per_iteration": 6.51531267, "memory(GiB)": 21.51, "elapsed_time": "7h 4m 38s", "remaining_time": "4h 32m 21s", "loss_scale": 1.0, "consumed_samples": 992256, "global_step/max_steps": "3876/6362"} +{"lm loss": 4.90662766, "grad_norm": 0.47658038, "learning_rate": 3.814e-05, "elapsed_time_per_iteration": 6.70011449, "memory(GiB)": 21.51, "elapsed_time": "7h 4m 45s", "remaining_time": "4h 32m 14s", "loss_scale": 1.0, "consumed_samples": 992512, "global_step/max_steps": "3877/6362"} +{"lm loss": 4.89394665, "grad_norm": 0.45651123, "learning_rate": 3.811e-05, "elapsed_time_per_iteration": 6.43224072, "memory(GiB)": 21.51, "elapsed_time": "7h 4m 51s", "remaining_time": "4h 32m 8s", "loss_scale": 1.0, "consumed_samples": 992768, "global_step/max_steps": "3878/6362"} +{"lm loss": 4.91720629, "grad_norm": 0.43083185, "learning_rate": 3.809e-05, "elapsed_time_per_iteration": 6.53216195, "memory(GiB)": 21.51, "elapsed_time": "7h 4m 57s", "remaining_time": "4h 32m 1s", "loss_scale": 1.0, "consumed_samples": 993024, "global_step/max_steps": "3879/6362"} +{"lm loss": 4.89597034, "grad_norm": 0.469915, "learning_rate": 3.807e-05, "elapsed_time_per_iteration": 6.41833735, "memory(GiB)": 21.51, "elapsed_time": "7h 5m 4s", "remaining_time": "4h 31m 54s", "loss_scale": 1.0, "consumed_samples": 993280, "global_step/max_steps": "3880/6362"} +{"lm loss": 4.88944054, "grad_norm": 0.38521674, "learning_rate": 3.804e-05, "elapsed_time_per_iteration": 6.44047141, "memory(GiB)": 21.51, "elapsed_time": "7h 5m 10s", "remaining_time": "4h 31m 48s", "loss_scale": 1.0, "consumed_samples": 993536, "global_step/max_steps": "3881/6362"} +{"lm loss": 4.88388729, "grad_norm": 0.46449739, "learning_rate": 3.802e-05, "elapsed_time_per_iteration": 6.62140226, "memory(GiB)": 21.51, "elapsed_time": "7h 5m 17s", "remaining_time": "4h 31m 41s", "loss_scale": 1.0, "consumed_samples": 993792, "global_step/max_steps": "3882/6362"} +{"lm loss": 4.87324572, "grad_norm": 0.40572393, "learning_rate": 3.799e-05, "elapsed_time_per_iteration": 6.61594296, "memory(GiB)": 21.51, "elapsed_time": "7h 5m 24s", "remaining_time": "4h 31m 35s", "loss_scale": 1.0, "consumed_samples": 994048, "global_step/max_steps": "3883/6362"} +{"lm loss": 4.89595366, "grad_norm": 0.41106465, "learning_rate": 3.797e-05, "elapsed_time_per_iteration": 6.51519346, "memory(GiB)": 21.51, "elapsed_time": "7h 5m 30s", "remaining_time": "4h 31m 28s", "loss_scale": 1.0, "consumed_samples": 994304, "global_step/max_steps": "3884/6362"} +{"lm loss": 4.91157103, "grad_norm": 0.38450137, "learning_rate": 3.794e-05, "elapsed_time_per_iteration": 6.57975435, "memory(GiB)": 21.51, "elapsed_time": "7h 5m 37s", "remaining_time": "4h 31m 21s", "loss_scale": 1.0, "consumed_samples": 994560, "global_step/max_steps": "3885/6362"} +{"lm loss": 4.91186523, "grad_norm": 0.40605038, "learning_rate": 3.792e-05, "elapsed_time_per_iteration": 6.7846787, "memory(GiB)": 21.51, "elapsed_time": "7h 5m 43s", "remaining_time": "4h 31m 15s", "loss_scale": 1.0, "consumed_samples": 994816, "global_step/max_steps": "3886/6362"} +{"lm loss": 4.87479973, "grad_norm": 0.4327741, "learning_rate": 3.79e-05, "elapsed_time_per_iteration": 6.69559336, "memory(GiB)": 21.51, "elapsed_time": "7h 5m 50s", "remaining_time": "4h 31m 9s", "loss_scale": 1.0, "consumed_samples": 995072, "global_step/max_steps": "3887/6362"} +{"lm loss": 4.88068199, "grad_norm": 0.4226253, "learning_rate": 3.787e-05, "elapsed_time_per_iteration": 6.54571223, "memory(GiB)": 21.51, "elapsed_time": "7h 5m 57s", "remaining_time": "4h 31m 2s", "loss_scale": 1.0, "consumed_samples": 995328, "global_step/max_steps": "3888/6362"} +{"lm loss": 4.88992119, "grad_norm": 0.43934035, "learning_rate": 3.785e-05, "elapsed_time_per_iteration": 6.48429179, "memory(GiB)": 21.51, "elapsed_time": "7h 6m 3s", "remaining_time": "4h 30m 55s", "loss_scale": 1.0, "consumed_samples": 995584, "global_step/max_steps": "3889/6362"} +{"lm loss": 4.9169426, "grad_norm": 0.41934583, "learning_rate": 3.782e-05, "elapsed_time_per_iteration": 6.47188616, "memory(GiB)": 21.51, "elapsed_time": "7h 6m 10s", "remaining_time": "4h 30m 49s", "loss_scale": 1.0, "consumed_samples": 995840, "global_step/max_steps": "3890/6362"} +{"lm loss": 4.89015579, "grad_norm": 0.44877878, "learning_rate": 3.78e-05, "elapsed_time_per_iteration": 6.46606255, "memory(GiB)": 21.51, "elapsed_time": "7h 6m 16s", "remaining_time": "4h 30m 42s", "loss_scale": 1.0, "consumed_samples": 996096, "global_step/max_steps": "3891/6362"} +{"lm loss": 4.9013567, "grad_norm": 0.38861558, "learning_rate": 3.778e-05, "elapsed_time_per_iteration": 6.33604693, "memory(GiB)": 21.51, "elapsed_time": "7h 6m 22s", "remaining_time": "4h 30m 35s", "loss_scale": 1.0, "consumed_samples": 996352, "global_step/max_steps": "3892/6362"} +{"lm loss": 4.92617607, "grad_norm": 0.42814174, "learning_rate": 3.775e-05, "elapsed_time_per_iteration": 6.66195178, "memory(GiB)": 21.51, "elapsed_time": "7h 6m 29s", "remaining_time": "4h 30m 29s", "loss_scale": 1.0, "consumed_samples": 996608, "global_step/max_steps": "3893/6362"} +{"lm loss": 4.89082384, "grad_norm": 0.46362895, "learning_rate": 3.773e-05, "elapsed_time_per_iteration": 6.47236323, "memory(GiB)": 21.51, "elapsed_time": "7h 6m 36s", "remaining_time": "4h 30m 22s", "loss_scale": 1.0, "consumed_samples": 996864, "global_step/max_steps": "3894/6362"} +{"lm loss": 4.93347454, "grad_norm": 0.41061923, "learning_rate": 3.77e-05, "elapsed_time_per_iteration": 6.54335713, "memory(GiB)": 21.51, "elapsed_time": "7h 6m 42s", "remaining_time": "4h 30m 16s", "loss_scale": 1.0, "consumed_samples": 997120, "global_step/max_steps": "3895/6362"} +{"lm loss": 4.8856864, "grad_norm": 0.43820021, "learning_rate": 3.768e-05, "elapsed_time_per_iteration": 6.6997993, "memory(GiB)": 21.51, "elapsed_time": "7h 6m 49s", "remaining_time": "4h 30m 9s", "loss_scale": 1.0, "consumed_samples": 997376, "global_step/max_steps": "3896/6362"} +{"lm loss": 4.91468525, "grad_norm": 0.39752927, "learning_rate": 3.765e-05, "elapsed_time_per_iteration": 6.59898686, "memory(GiB)": 21.51, "elapsed_time": "7h 6m 55s", "remaining_time": "4h 30m 3s", "loss_scale": 1.0, "consumed_samples": 997632, "global_step/max_steps": "3897/6362"} +{"lm loss": 4.88125801, "grad_norm": 0.41959378, "learning_rate": 3.763e-05, "elapsed_time_per_iteration": 6.47546077, "memory(GiB)": 21.51, "elapsed_time": "7h 7m 2s", "remaining_time": "4h 29m 56s", "loss_scale": 1.0, "consumed_samples": 997888, "global_step/max_steps": "3898/6362"} +{"lm loss": 4.88557816, "grad_norm": 0.40128839, "learning_rate": 3.761e-05, "elapsed_time_per_iteration": 6.76194191, "memory(GiB)": 21.51, "elapsed_time": "7h 7m 9s", "remaining_time": "4h 29m 49s", "loss_scale": 1.0, "consumed_samples": 998144, "global_step/max_steps": "3899/6362"} +{"lm loss": 4.89045334, "grad_norm": 0.43689123, "learning_rate": 3.758e-05, "elapsed_time_per_iteration": 6.92033577, "memory(GiB)": 21.51, "elapsed_time": "7h 7m 16s", "remaining_time": "4h 29m 43s", "loss_scale": 1.0, "consumed_samples": 998400, "global_step/max_steps": "3900/6362"} +{"lm loss": 4.9183135, "grad_norm": 0.41329286, "learning_rate": 3.756e-05, "elapsed_time_per_iteration": 6.79191232, "memory(GiB)": 21.51, "elapsed_time": "7h 7m 22s", "remaining_time": "4h 29m 37s", "loss_scale": 1.0, "consumed_samples": 998656, "global_step/max_steps": "3901/6362"} +{"lm loss": 4.90637922, "grad_norm": 0.42725068, "learning_rate": 3.753e-05, "elapsed_time_per_iteration": 6.60720086, "memory(GiB)": 21.51, "elapsed_time": "7h 7m 29s", "remaining_time": "4h 29m 30s", "loss_scale": 1.0, "consumed_samples": 998912, "global_step/max_steps": "3902/6362"} +{"lm loss": 4.90050316, "grad_norm": 0.41898555, "learning_rate": 3.751e-05, "elapsed_time_per_iteration": 6.62702823, "memory(GiB)": 21.51, "elapsed_time": "7h 7m 36s", "remaining_time": "4h 29m 24s", "loss_scale": 1.0, "consumed_samples": 999168, "global_step/max_steps": "3903/6362"} +{"lm loss": 4.90726662, "grad_norm": 0.40697813, "learning_rate": 3.749e-05, "elapsed_time_per_iteration": 6.59847212, "memory(GiB)": 21.51, "elapsed_time": "7h 7m 42s", "remaining_time": "4h 29m 17s", "loss_scale": 1.0, "consumed_samples": 999424, "global_step/max_steps": "3904/6362"} +{"lm loss": 4.91182947, "grad_norm": 0.42762479, "learning_rate": 3.746e-05, "elapsed_time_per_iteration": 6.67144465, "memory(GiB)": 21.51, "elapsed_time": "7h 7m 49s", "remaining_time": "4h 29m 10s", "loss_scale": 1.0, "consumed_samples": 999680, "global_step/max_steps": "3905/6362"} +{"lm loss": 4.90937614, "grad_norm": 0.39619872, "learning_rate": 3.744e-05, "elapsed_time_per_iteration": 6.63378191, "memory(GiB)": 21.51, "elapsed_time": "7h 7m 56s", "remaining_time": "4h 29m 4s", "loss_scale": 1.0, "consumed_samples": 999936, "global_step/max_steps": "3906/6362"} +{"lm loss": 4.89810324, "grad_norm": 0.4583548, "learning_rate": 3.741e-05, "elapsed_time_per_iteration": 6.46110106, "memory(GiB)": 21.51, "elapsed_time": "7h 8m 2s", "remaining_time": "4h 28m 57s", "loss_scale": 1.0, "consumed_samples": 1000192, "global_step/max_steps": "3907/6362"} +{"lm loss": 4.90555763, "grad_norm": 0.39996257, "learning_rate": 3.739e-05, "elapsed_time_per_iteration": 6.8365643, "memory(GiB)": 21.51, "elapsed_time": "7h 8m 9s", "remaining_time": "4h 28m 51s", "loss_scale": 1.0, "consumed_samples": 1000448, "global_step/max_steps": "3908/6362"} +{"lm loss": 4.90917587, "grad_norm": 0.40826035, "learning_rate": 3.736e-05, "elapsed_time_per_iteration": 6.82459021, "memory(GiB)": 21.51, "elapsed_time": "7h 8m 16s", "remaining_time": "4h 28m 44s", "loss_scale": 1.0, "consumed_samples": 1000704, "global_step/max_steps": "3909/6362"} +{"lm loss": 4.90132618, "grad_norm": 0.4464224, "learning_rate": 3.734e-05, "elapsed_time_per_iteration": 6.88217235, "memory(GiB)": 21.51, "elapsed_time": "7h 8m 23s", "remaining_time": "4h 28m 38s", "loss_scale": 1.0, "consumed_samples": 1000960, "global_step/max_steps": "3910/6362"} +{"lm loss": 4.89640093, "grad_norm": 0.4188731, "learning_rate": 3.732e-05, "elapsed_time_per_iteration": 6.66277027, "memory(GiB)": 21.51, "elapsed_time": "7h 8m 29s", "remaining_time": "4h 28m 32s", "loss_scale": 1.0, "consumed_samples": 1001216, "global_step/max_steps": "3911/6362"} +{"lm loss": 4.90051317, "grad_norm": 0.41553032, "learning_rate": 3.729e-05, "elapsed_time_per_iteration": 6.58250117, "memory(GiB)": 21.51, "elapsed_time": "7h 8m 36s", "remaining_time": "4h 28m 25s", "loss_scale": 1.0, "consumed_samples": 1001472, "global_step/max_steps": "3912/6362"} +{"lm loss": 4.89698601, "grad_norm": 0.40140054, "learning_rate": 3.727e-05, "elapsed_time_per_iteration": 6.77977371, "memory(GiB)": 21.51, "elapsed_time": "7h 8m 43s", "remaining_time": "4h 28m 19s", "loss_scale": 1.0, "consumed_samples": 1001728, "global_step/max_steps": "3913/6362"} +{"lm loss": 4.90240908, "grad_norm": 0.4675045, "learning_rate": 3.724e-05, "elapsed_time_per_iteration": 6.73659801, "memory(GiB)": 21.51, "elapsed_time": "7h 8m 49s", "remaining_time": "4h 28m 12s", "loss_scale": 1.0, "consumed_samples": 1001984, "global_step/max_steps": "3914/6362"} +{"lm loss": 4.91751146, "grad_norm": 0.43377924, "learning_rate": 3.722e-05, "elapsed_time_per_iteration": 6.77877903, "memory(GiB)": 21.51, "elapsed_time": "7h 8m 56s", "remaining_time": "4h 28m 6s", "loss_scale": 1.0, "consumed_samples": 1002240, "global_step/max_steps": "3915/6362"} +{"lm loss": 4.91899204, "grad_norm": 0.41096866, "learning_rate": 3.72e-05, "elapsed_time_per_iteration": 6.52656937, "memory(GiB)": 21.51, "elapsed_time": "7h 9m 3s", "remaining_time": "4h 27m 59s", "loss_scale": 1.0, "consumed_samples": 1002496, "global_step/max_steps": "3916/6362"} +{"lm loss": 4.90450478, "grad_norm": 0.46071535, "learning_rate": 3.717e-05, "elapsed_time_per_iteration": 6.56741762, "memory(GiB)": 21.51, "elapsed_time": "7h 9m 9s", "remaining_time": "4h 27m 52s", "loss_scale": 1.0, "consumed_samples": 1002752, "global_step/max_steps": "3917/6362"} +{"lm loss": 4.8767662, "grad_norm": 0.43898246, "learning_rate": 3.715e-05, "elapsed_time_per_iteration": 6.57398748, "memory(GiB)": 21.51, "elapsed_time": "7h 9m 16s", "remaining_time": "4h 27m 46s", "loss_scale": 1.0, "consumed_samples": 1003008, "global_step/max_steps": "3918/6362"} +{"lm loss": 4.91486931, "grad_norm": 0.49826121, "learning_rate": 3.712e-05, "elapsed_time_per_iteration": 6.54510522, "memory(GiB)": 21.51, "elapsed_time": "7h 9m 22s", "remaining_time": "4h 27m 39s", "loss_scale": 1.0, "consumed_samples": 1003264, "global_step/max_steps": "3919/6362"} +{"lm loss": 4.88668966, "grad_norm": 0.43823394, "learning_rate": 3.71e-05, "elapsed_time_per_iteration": 6.4851892, "memory(GiB)": 21.51, "elapsed_time": "7h 9m 29s", "remaining_time": "4h 27m 33s", "loss_scale": 1.0, "consumed_samples": 1003520, "global_step/max_steps": "3920/6362"} +{"lm loss": 4.88215685, "grad_norm": 0.51086807, "learning_rate": 3.708e-05, "elapsed_time_per_iteration": 6.69127512, "memory(GiB)": 21.51, "elapsed_time": "7h 9m 35s", "remaining_time": "4h 27m 26s", "loss_scale": 1.0, "consumed_samples": 1003776, "global_step/max_steps": "3921/6362"} +{"lm loss": 4.86899281, "grad_norm": 0.4685981, "learning_rate": 3.705e-05, "elapsed_time_per_iteration": 6.38116241, "memory(GiB)": 21.51, "elapsed_time": "7h 9m 42s", "remaining_time": "4h 27m 19s", "loss_scale": 1.0, "consumed_samples": 1004032, "global_step/max_steps": "3922/6362"} +{"lm loss": 4.88888168, "grad_norm": 0.49924475, "learning_rate": 3.703e-05, "elapsed_time_per_iteration": 6.67572069, "memory(GiB)": 21.51, "elapsed_time": "7h 9m 48s", "remaining_time": "4h 27m 13s", "loss_scale": 1.0, "consumed_samples": 1004288, "global_step/max_steps": "3923/6362"} +{"lm loss": 4.88770151, "grad_norm": 0.46955141, "learning_rate": 3.7e-05, "elapsed_time_per_iteration": 6.59253502, "memory(GiB)": 21.51, "elapsed_time": "7h 9m 55s", "remaining_time": "4h 27m 6s", "loss_scale": 1.0, "consumed_samples": 1004544, "global_step/max_steps": "3924/6362"} +{"lm loss": 4.90356636, "grad_norm": 0.4569307, "learning_rate": 3.698e-05, "elapsed_time_per_iteration": 6.76822567, "memory(GiB)": 21.51, "elapsed_time": "7h 10m 2s", "remaining_time": "4h 27m 0s", "loss_scale": 1.0, "consumed_samples": 1004800, "global_step/max_steps": "3925/6362"} +{"lm loss": 4.8885026, "grad_norm": 0.48242843, "learning_rate": 3.696e-05, "elapsed_time_per_iteration": 6.84126019, "memory(GiB)": 21.51, "elapsed_time": "7h 10m 9s", "remaining_time": "4h 26m 54s", "loss_scale": 1.0, "consumed_samples": 1005056, "global_step/max_steps": "3926/6362"} +{"lm loss": 4.88851261, "grad_norm": 0.49597654, "learning_rate": 3.693e-05, "elapsed_time_per_iteration": 6.63670111, "memory(GiB)": 21.51, "elapsed_time": "7h 10m 15s", "remaining_time": "4h 26m 47s", "loss_scale": 1.0, "consumed_samples": 1005312, "global_step/max_steps": "3927/6362"} +{"lm loss": 4.90114212, "grad_norm": 0.43801734, "learning_rate": 3.691e-05, "elapsed_time_per_iteration": 6.64493585, "memory(GiB)": 21.51, "elapsed_time": "7h 10m 22s", "remaining_time": "4h 26m 40s", "loss_scale": 1.0, "consumed_samples": 1005568, "global_step/max_steps": "3928/6362"} +{"lm loss": 4.89184999, "grad_norm": 0.46562159, "learning_rate": 3.688e-05, "elapsed_time_per_iteration": 6.67165589, "memory(GiB)": 21.51, "elapsed_time": "7h 10m 29s", "remaining_time": "4h 26m 34s", "loss_scale": 1.0, "consumed_samples": 1005824, "global_step/max_steps": "3929/6362"} +{"lm loss": 4.91975021, "grad_norm": 0.42344424, "learning_rate": 3.686e-05, "elapsed_time_per_iteration": 6.55584645, "memory(GiB)": 21.51, "elapsed_time": "7h 10m 35s", "remaining_time": "4h 26m 27s", "loss_scale": 1.0, "consumed_samples": 1006080, "global_step/max_steps": "3930/6362"} +{"lm loss": 4.91210938, "grad_norm": 0.48843777, "learning_rate": 3.684e-05, "elapsed_time_per_iteration": 6.55299544, "memory(GiB)": 21.51, "elapsed_time": "7h 10m 42s", "remaining_time": "4h 26m 21s", "loss_scale": 1.0, "consumed_samples": 1006336, "global_step/max_steps": "3931/6362"} +{"lm loss": 4.92494392, "grad_norm": 0.43729764, "learning_rate": 3.681e-05, "elapsed_time_per_iteration": 6.64891195, "memory(GiB)": 21.51, "elapsed_time": "7h 10m 48s", "remaining_time": "4h 26m 14s", "loss_scale": 1.0, "consumed_samples": 1006592, "global_step/max_steps": "3932/6362"} +{"lm loss": 4.90753841, "grad_norm": 0.46877119, "learning_rate": 3.679e-05, "elapsed_time_per_iteration": 6.43832684, "memory(GiB)": 21.51, "elapsed_time": "7h 10m 55s", "remaining_time": "4h 26m 8s", "loss_scale": 1.0, "consumed_samples": 1006848, "global_step/max_steps": "3933/6362"} +{"lm loss": 4.93231249, "grad_norm": 0.48118347, "learning_rate": 3.676e-05, "elapsed_time_per_iteration": 6.7465353, "memory(GiB)": 21.51, "elapsed_time": "7h 11m 2s", "remaining_time": "4h 26m 1s", "loss_scale": 1.0, "consumed_samples": 1007104, "global_step/max_steps": "3934/6362"} +{"lm loss": 4.91881514, "grad_norm": 0.38785982, "learning_rate": 3.674e-05, "elapsed_time_per_iteration": 6.42064953, "memory(GiB)": 21.51, "elapsed_time": "7h 11m 8s", "remaining_time": "4h 25m 54s", "loss_scale": 1.0, "consumed_samples": 1007360, "global_step/max_steps": "3935/6362"} +{"lm loss": 4.9071579, "grad_norm": 0.46603632, "learning_rate": 3.672e-05, "elapsed_time_per_iteration": 6.49496102, "memory(GiB)": 21.51, "elapsed_time": "7h 11m 15s", "remaining_time": "4h 25m 48s", "loss_scale": 1.0, "consumed_samples": 1007616, "global_step/max_steps": "3936/6362"} +{"lm loss": 4.90243483, "grad_norm": 0.47186983, "learning_rate": 3.669e-05, "elapsed_time_per_iteration": 6.61585474, "memory(GiB)": 21.51, "elapsed_time": "7h 11m 21s", "remaining_time": "4h 25m 41s", "loss_scale": 1.0, "consumed_samples": 1007872, "global_step/max_steps": "3937/6362"} +{"lm loss": 4.90467405, "grad_norm": 0.42089334, "learning_rate": 3.667e-05, "elapsed_time_per_iteration": 6.39411235, "memory(GiB)": 21.51, "elapsed_time": "7h 11m 28s", "remaining_time": "4h 25m 35s", "loss_scale": 1.0, "consumed_samples": 1008128, "global_step/max_steps": "3938/6362"} +{"lm loss": 4.8840704, "grad_norm": 0.43679461, "learning_rate": 3.664e-05, "elapsed_time_per_iteration": 6.85867739, "memory(GiB)": 21.51, "elapsed_time": "7h 11m 34s", "remaining_time": "4h 25m 28s", "loss_scale": 1.0, "consumed_samples": 1008384, "global_step/max_steps": "3939/6362"} +{"lm loss": 4.89010715, "grad_norm": 0.44757664, "learning_rate": 3.662e-05, "elapsed_time_per_iteration": 6.49005938, "memory(GiB)": 21.51, "elapsed_time": "7h 11m 41s", "remaining_time": "4h 25m 22s", "loss_scale": 1.0, "consumed_samples": 1008640, "global_step/max_steps": "3940/6362"} +{"lm loss": 4.88724899, "grad_norm": 0.44733989, "learning_rate": 3.66e-05, "elapsed_time_per_iteration": 6.65411234, "memory(GiB)": 21.51, "elapsed_time": "7h 11m 48s", "remaining_time": "4h 25m 15s", "loss_scale": 1.0, "consumed_samples": 1008896, "global_step/max_steps": "3941/6362"} +{"lm loss": 4.90025187, "grad_norm": 0.42106155, "learning_rate": 3.657e-05, "elapsed_time_per_iteration": 6.39853501, "memory(GiB)": 21.51, "elapsed_time": "7h 11m 54s", "remaining_time": "4h 25m 8s", "loss_scale": 1.0, "consumed_samples": 1009152, "global_step/max_steps": "3942/6362"} +{"lm loss": 4.90600538, "grad_norm": 0.49712107, "learning_rate": 3.655e-05, "elapsed_time_per_iteration": 6.4221499, "memory(GiB)": 21.51, "elapsed_time": "7h 12m 0s", "remaining_time": "4h 25m 2s", "loss_scale": 1.0, "consumed_samples": 1009408, "global_step/max_steps": "3943/6362"} +{"lm loss": 4.90705156, "grad_norm": 0.51504296, "learning_rate": 3.652e-05, "elapsed_time_per_iteration": 6.6294539, "memory(GiB)": 21.51, "elapsed_time": "7h 12m 7s", "remaining_time": "4h 24m 55s", "loss_scale": 1.0, "consumed_samples": 1009664, "global_step/max_steps": "3944/6362"} +{"lm loss": 4.90342951, "grad_norm": 0.45394772, "learning_rate": 3.65e-05, "elapsed_time_per_iteration": 6.57536006, "memory(GiB)": 21.51, "elapsed_time": "7h 12m 14s", "remaining_time": "4h 24m 49s", "loss_scale": 1.0, "consumed_samples": 1009920, "global_step/max_steps": "3945/6362"} +{"lm loss": 4.89165545, "grad_norm": 0.42799327, "learning_rate": 3.648e-05, "elapsed_time_per_iteration": 6.61787772, "memory(GiB)": 21.51, "elapsed_time": "7h 12m 20s", "remaining_time": "4h 24m 42s", "loss_scale": 1.0, "consumed_samples": 1010176, "global_step/max_steps": "3946/6362"} +{"lm loss": 4.89728069, "grad_norm": 0.47824094, "learning_rate": 3.645e-05, "elapsed_time_per_iteration": 6.39243484, "memory(GiB)": 21.51, "elapsed_time": "7h 12m 27s", "remaining_time": "4h 24m 35s", "loss_scale": 1.0, "consumed_samples": 1010432, "global_step/max_steps": "3947/6362"} +{"lm loss": 4.90482903, "grad_norm": 0.48270142, "learning_rate": 3.643e-05, "elapsed_time_per_iteration": 6.6874938, "memory(GiB)": 21.51, "elapsed_time": "7h 12m 33s", "remaining_time": "4h 24m 29s", "loss_scale": 1.0, "consumed_samples": 1010688, "global_step/max_steps": "3948/6362"} +{"lm loss": 4.88801289, "grad_norm": 0.42548063, "learning_rate": 3.64e-05, "elapsed_time_per_iteration": 6.62977505, "memory(GiB)": 21.51, "elapsed_time": "7h 12m 40s", "remaining_time": "4h 24m 22s", "loss_scale": 1.0, "consumed_samples": 1010944, "global_step/max_steps": "3949/6362"} +{"lm loss": 4.91250181, "grad_norm": 0.42942294, "learning_rate": 3.638e-05, "elapsed_time_per_iteration": 6.62236762, "memory(GiB)": 21.51, "elapsed_time": "7h 12m 47s", "remaining_time": "4h 24m 16s", "loss_scale": 1.0, "consumed_samples": 1011200, "global_step/max_steps": "3950/6362"} +{"lm loss": 4.90422726, "grad_norm": 0.51678687, "learning_rate": 3.636e-05, "elapsed_time_per_iteration": 6.57721257, "memory(GiB)": 21.51, "elapsed_time": "7h 12m 53s", "remaining_time": "4h 24m 9s", "loss_scale": 1.0, "consumed_samples": 1011456, "global_step/max_steps": "3951/6362"} +{"lm loss": 4.89926195, "grad_norm": 0.48461282, "learning_rate": 3.633e-05, "elapsed_time_per_iteration": 6.83285952, "memory(GiB)": 21.51, "elapsed_time": "7h 13m 0s", "remaining_time": "4h 24m 3s", "loss_scale": 1.0, "consumed_samples": 1011712, "global_step/max_steps": "3952/6362"} +{"lm loss": 4.90267611, "grad_norm": 0.48750973, "learning_rate": 3.631e-05, "elapsed_time_per_iteration": 6.65325475, "memory(GiB)": 21.51, "elapsed_time": "7h 13m 7s", "remaining_time": "4h 23m 56s", "loss_scale": 1.0, "consumed_samples": 1011968, "global_step/max_steps": "3953/6362"} +{"lm loss": 4.88783646, "grad_norm": 0.39600772, "learning_rate": 3.628e-05, "elapsed_time_per_iteration": 6.50620008, "memory(GiB)": 21.51, "elapsed_time": "7h 13m 13s", "remaining_time": "4h 23m 50s", "loss_scale": 1.0, "consumed_samples": 1012224, "global_step/max_steps": "3954/6362"} +{"lm loss": 4.90421104, "grad_norm": 0.48038146, "learning_rate": 3.626e-05, "elapsed_time_per_iteration": 6.51871705, "memory(GiB)": 21.51, "elapsed_time": "7h 13m 20s", "remaining_time": "4h 23m 43s", "loss_scale": 1.0, "consumed_samples": 1012480, "global_step/max_steps": "3955/6362"} +{"lm loss": 4.93006992, "grad_norm": 0.46820766, "learning_rate": 3.624e-05, "elapsed_time_per_iteration": 6.54071331, "memory(GiB)": 21.51, "elapsed_time": "7h 13m 26s", "remaining_time": "4h 23m 36s", "loss_scale": 1.0, "consumed_samples": 1012736, "global_step/max_steps": "3956/6362"} +{"lm loss": 4.89510489, "grad_norm": 0.40883499, "learning_rate": 3.621e-05, "elapsed_time_per_iteration": 6.78332639, "memory(GiB)": 21.51, "elapsed_time": "7h 13m 33s", "remaining_time": "4h 23m 30s", "loss_scale": 1.0, "consumed_samples": 1012992, "global_step/max_steps": "3957/6362"} +{"lm loss": 4.91555691, "grad_norm": 0.53035015, "learning_rate": 3.619e-05, "elapsed_time_per_iteration": 6.90029645, "memory(GiB)": 21.51, "elapsed_time": "7h 13m 40s", "remaining_time": "4h 23m 24s", "loss_scale": 1.0, "consumed_samples": 1013248, "global_step/max_steps": "3958/6362"} +{"lm loss": 4.89536762, "grad_norm": 0.47378922, "learning_rate": 3.616e-05, "elapsed_time_per_iteration": 6.82123137, "memory(GiB)": 21.51, "elapsed_time": "7h 13m 47s", "remaining_time": "4h 23m 17s", "loss_scale": 1.0, "consumed_samples": 1013504, "global_step/max_steps": "3959/6362"} +{"lm loss": 4.91441059, "grad_norm": 0.43087792, "learning_rate": 3.614e-05, "elapsed_time_per_iteration": 6.68409395, "memory(GiB)": 21.51, "elapsed_time": "7h 13m 53s", "remaining_time": "4h 23m 11s", "loss_scale": 1.0, "consumed_samples": 1013760, "global_step/max_steps": "3960/6362"} +{"lm loss": 4.91233492, "grad_norm": 0.51067352, "learning_rate": 3.612e-05, "elapsed_time_per_iteration": 6.4035306, "memory(GiB)": 21.51, "elapsed_time": "7h 14m 0s", "remaining_time": "4h 23m 4s", "loss_scale": 1.0, "consumed_samples": 1014016, "global_step/max_steps": "3961/6362"} +{"lm loss": 4.90361881, "grad_norm": 0.47983822, "learning_rate": 3.609e-05, "elapsed_time_per_iteration": 6.49743795, "memory(GiB)": 21.51, "elapsed_time": "7h 14m 6s", "remaining_time": "4h 22m 57s", "loss_scale": 1.0, "consumed_samples": 1014272, "global_step/max_steps": "3962/6362"} +{"lm loss": 4.91235971, "grad_norm": 0.45771456, "learning_rate": 3.607e-05, "elapsed_time_per_iteration": 6.4190495, "memory(GiB)": 21.51, "elapsed_time": "7h 14m 13s", "remaining_time": "4h 22m 51s", "loss_scale": 1.0, "consumed_samples": 1014528, "global_step/max_steps": "3963/6362"} +{"lm loss": 4.90045309, "grad_norm": 0.45442635, "learning_rate": 3.604e-05, "elapsed_time_per_iteration": 6.66809535, "memory(GiB)": 21.51, "elapsed_time": "7h 14m 19s", "remaining_time": "4h 22m 44s", "loss_scale": 1.0, "consumed_samples": 1014784, "global_step/max_steps": "3964/6362"} +{"lm loss": 4.87751055, "grad_norm": 0.53193992, "learning_rate": 3.602e-05, "elapsed_time_per_iteration": 6.51223874, "memory(GiB)": 21.51, "elapsed_time": "7h 14m 26s", "remaining_time": "4h 22m 38s", "loss_scale": 1.0, "consumed_samples": 1015040, "global_step/max_steps": "3965/6362"} +{"lm loss": 4.91259336, "grad_norm": 0.50753683, "learning_rate": 3.6e-05, "elapsed_time_per_iteration": 6.70017719, "memory(GiB)": 21.51, "elapsed_time": "7h 14m 33s", "remaining_time": "4h 22m 31s", "loss_scale": 1.0, "consumed_samples": 1015296, "global_step/max_steps": "3966/6362"} +{"lm loss": 4.88659859, "grad_norm": 0.37241232, "learning_rate": 3.597e-05, "elapsed_time_per_iteration": 6.66721892, "memory(GiB)": 21.51, "elapsed_time": "7h 14m 39s", "remaining_time": "4h 22m 25s", "loss_scale": 1.0, "consumed_samples": 1015552, "global_step/max_steps": "3967/6362"} +{"lm loss": 4.91203403, "grad_norm": 0.50123364, "learning_rate": 3.595e-05, "elapsed_time_per_iteration": 6.7261095, "memory(GiB)": 21.51, "elapsed_time": "7h 14m 46s", "remaining_time": "4h 22m 18s", "loss_scale": 1.0, "consumed_samples": 1015808, "global_step/max_steps": "3968/6362"} +{"lm loss": 4.89635277, "grad_norm": 0.37510827, "learning_rate": 3.593e-05, "elapsed_time_per_iteration": 6.72001195, "memory(GiB)": 21.51, "elapsed_time": "7h 14m 53s", "remaining_time": "4h 22m 12s", "loss_scale": 1.0, "consumed_samples": 1016064, "global_step/max_steps": "3969/6362"} +{"lm loss": 4.89496899, "grad_norm": 0.45517975, "learning_rate": 3.59e-05, "elapsed_time_per_iteration": 6.57141662, "memory(GiB)": 21.51, "elapsed_time": "7h 14m 59s", "remaining_time": "4h 22m 5s", "loss_scale": 1.0, "consumed_samples": 1016320, "global_step/max_steps": "3970/6362"} +{"lm loss": 4.88376236, "grad_norm": 0.40396523, "learning_rate": 3.588e-05, "elapsed_time_per_iteration": 6.59027267, "memory(GiB)": 21.51, "elapsed_time": "7h 15m 6s", "remaining_time": "4h 21m 59s", "loss_scale": 1.0, "consumed_samples": 1016576, "global_step/max_steps": "3971/6362"} +{"lm loss": 4.90363407, "grad_norm": 0.42558801, "learning_rate": 3.585e-05, "elapsed_time_per_iteration": 6.45196605, "memory(GiB)": 21.51, "elapsed_time": "7h 15m 12s", "remaining_time": "4h 21m 52s", "loss_scale": 1.0, "consumed_samples": 1016832, "global_step/max_steps": "3972/6362"} +{"lm loss": 4.90233183, "grad_norm": 0.42516625, "learning_rate": 3.583e-05, "elapsed_time_per_iteration": 6.65644193, "memory(GiB)": 21.51, "elapsed_time": "7h 15m 19s", "remaining_time": "4h 21m 45s", "loss_scale": 1.0, "consumed_samples": 1017088, "global_step/max_steps": "3973/6362"} +{"lm loss": 4.89364624, "grad_norm": 0.4199076, "learning_rate": 3.581e-05, "elapsed_time_per_iteration": 6.68097782, "memory(GiB)": 21.51, "elapsed_time": "7h 15m 26s", "remaining_time": "4h 21m 39s", "loss_scale": 1.0, "consumed_samples": 1017344, "global_step/max_steps": "3974/6362"} +{"lm loss": 4.89999914, "grad_norm": 0.46403921, "learning_rate": 3.578e-05, "elapsed_time_per_iteration": 6.61739016, "memory(GiB)": 21.51, "elapsed_time": "7h 15m 32s", "remaining_time": "4h 21m 32s", "loss_scale": 1.0, "consumed_samples": 1017600, "global_step/max_steps": "3975/6362"} +{"lm loss": 4.90591717, "grad_norm": 0.42662647, "learning_rate": 3.576e-05, "elapsed_time_per_iteration": 6.50289536, "memory(GiB)": 21.51, "elapsed_time": "7h 15m 39s", "remaining_time": "4h 21m 26s", "loss_scale": 1.0, "consumed_samples": 1017856, "global_step/max_steps": "3976/6362"} +{"lm loss": 4.89817047, "grad_norm": 0.43465689, "learning_rate": 3.573e-05, "elapsed_time_per_iteration": 6.62054658, "memory(GiB)": 21.51, "elapsed_time": "7h 15m 45s", "remaining_time": "4h 21m 19s", "loss_scale": 1.0, "consumed_samples": 1018112, "global_step/max_steps": "3977/6362"} +{"lm loss": 4.90360403, "grad_norm": 0.42179233, "learning_rate": 3.571e-05, "elapsed_time_per_iteration": 6.38290501, "memory(GiB)": 21.51, "elapsed_time": "7h 15m 52s", "remaining_time": "4h 21m 12s", "loss_scale": 1.0, "consumed_samples": 1018368, "global_step/max_steps": "3978/6362"} +{"lm loss": 4.90985632, "grad_norm": 0.43740857, "learning_rate": 3.569e-05, "elapsed_time_per_iteration": 6.42688441, "memory(GiB)": 21.51, "elapsed_time": "7h 15m 58s", "remaining_time": "4h 21m 6s", "loss_scale": 1.0, "consumed_samples": 1018624, "global_step/max_steps": "3979/6362"} +{"lm loss": 4.88798809, "grad_norm": 0.41268283, "learning_rate": 3.566e-05, "elapsed_time_per_iteration": 6.48031545, "memory(GiB)": 21.51, "elapsed_time": "7h 16m 5s", "remaining_time": "4h 20m 59s", "loss_scale": 1.0, "consumed_samples": 1018880, "global_step/max_steps": "3980/6362"} +{"lm loss": 4.88781118, "grad_norm": 0.41111082, "learning_rate": 3.564e-05, "elapsed_time_per_iteration": 6.50719953, "memory(GiB)": 21.51, "elapsed_time": "7h 16m 11s", "remaining_time": "4h 20m 53s", "loss_scale": 1.0, "consumed_samples": 1019136, "global_step/max_steps": "3981/6362"} +{"lm loss": 4.90329361, "grad_norm": 0.384821, "learning_rate": 3.562e-05, "elapsed_time_per_iteration": 6.74748373, "memory(GiB)": 21.51, "elapsed_time": "7h 16m 18s", "remaining_time": "4h 20m 46s", "loss_scale": 1.0, "consumed_samples": 1019392, "global_step/max_steps": "3982/6362"} +{"lm loss": 4.87926006, "grad_norm": 0.44825521, "learning_rate": 3.559e-05, "elapsed_time_per_iteration": 6.58321357, "memory(GiB)": 21.51, "elapsed_time": "7h 16m 24s", "remaining_time": "4h 20m 39s", "loss_scale": 1.0, "consumed_samples": 1019648, "global_step/max_steps": "3983/6362"} +{"lm loss": 4.87083864, "grad_norm": 0.43417582, "learning_rate": 3.557e-05, "elapsed_time_per_iteration": 6.47654223, "memory(GiB)": 21.51, "elapsed_time": "7h 16m 31s", "remaining_time": "4h 20m 33s", "loss_scale": 1.0, "consumed_samples": 1019904, "global_step/max_steps": "3984/6362"} +{"lm loss": 4.91569185, "grad_norm": 0.415301, "learning_rate": 3.554e-05, "elapsed_time_per_iteration": 6.59092164, "memory(GiB)": 21.51, "elapsed_time": "7h 16m 38s", "remaining_time": "4h 20m 26s", "loss_scale": 1.0, "consumed_samples": 1020160, "global_step/max_steps": "3985/6362"} +{"lm loss": 4.91737795, "grad_norm": 0.42523423, "learning_rate": 3.552e-05, "elapsed_time_per_iteration": 6.4232173, "memory(GiB)": 21.51, "elapsed_time": "7h 16m 44s", "remaining_time": "4h 20m 20s", "loss_scale": 1.0, "consumed_samples": 1020416, "global_step/max_steps": "3986/6362"} +{"lm loss": 4.91355896, "grad_norm": 0.41499484, "learning_rate": 3.55e-05, "elapsed_time_per_iteration": 6.58463216, "memory(GiB)": 21.51, "elapsed_time": "7h 16m 51s", "remaining_time": "4h 20m 13s", "loss_scale": 1.0, "consumed_samples": 1020672, "global_step/max_steps": "3987/6362"} +{"lm loss": 4.8845973, "grad_norm": 0.44498885, "learning_rate": 3.547e-05, "elapsed_time_per_iteration": 6.67348552, "memory(GiB)": 21.51, "elapsed_time": "7h 16m 57s", "remaining_time": "4h 20m 7s", "loss_scale": 1.0, "consumed_samples": 1020928, "global_step/max_steps": "3988/6362"} +{"lm loss": 4.8977313, "grad_norm": 0.41918263, "learning_rate": 3.545e-05, "elapsed_time_per_iteration": 6.46088433, "memory(GiB)": 21.51, "elapsed_time": "7h 17m 4s", "remaining_time": "4h 20m 0s", "loss_scale": 1.0, "consumed_samples": 1021184, "global_step/max_steps": "3989/6362"} +{"lm loss": 4.88869047, "grad_norm": 0.42457354, "learning_rate": 3.542e-05, "elapsed_time_per_iteration": 6.76585793, "memory(GiB)": 21.51, "elapsed_time": "7h 17m 10s", "remaining_time": "4h 19m 53s", "loss_scale": 1.0, "consumed_samples": 1021440, "global_step/max_steps": "3990/6362"} +{"lm loss": 4.89352036, "grad_norm": 0.4318009, "learning_rate": 3.54e-05, "elapsed_time_per_iteration": 6.69236517, "memory(GiB)": 21.51, "elapsed_time": "7h 17m 17s", "remaining_time": "4h 19m 47s", "loss_scale": 1.0, "consumed_samples": 1021696, "global_step/max_steps": "3991/6362"} +{"lm loss": 4.878335, "grad_norm": 0.46351889, "learning_rate": 3.538e-05, "elapsed_time_per_iteration": 6.55505443, "memory(GiB)": 21.51, "elapsed_time": "7h 17m 24s", "remaining_time": "4h 19m 40s", "loss_scale": 1.0, "consumed_samples": 1021952, "global_step/max_steps": "3992/6362"} +{"lm loss": 4.91618109, "grad_norm": 0.46573904, "learning_rate": 3.535e-05, "elapsed_time_per_iteration": 6.62379146, "memory(GiB)": 21.51, "elapsed_time": "7h 17m 30s", "remaining_time": "4h 19m 34s", "loss_scale": 1.0, "consumed_samples": 1022208, "global_step/max_steps": "3993/6362"} +{"lm loss": 4.92383146, "grad_norm": 0.4227607, "learning_rate": 3.533e-05, "elapsed_time_per_iteration": 6.81342196, "memory(GiB)": 21.51, "elapsed_time": "7h 17m 37s", "remaining_time": "4h 19m 27s", "loss_scale": 1.0, "consumed_samples": 1022464, "global_step/max_steps": "3994/6362"} +{"lm loss": 4.88484478, "grad_norm": 0.42118555, "learning_rate": 3.531e-05, "elapsed_time_per_iteration": 6.6141572, "memory(GiB)": 21.51, "elapsed_time": "7h 17m 44s", "remaining_time": "4h 19m 21s", "loss_scale": 1.0, "consumed_samples": 1022720, "global_step/max_steps": "3995/6362"} +{"lm loss": 4.89323044, "grad_norm": 0.42003042, "learning_rate": 3.528e-05, "elapsed_time_per_iteration": 6.57761502, "memory(GiB)": 21.51, "elapsed_time": "7h 17m 50s", "remaining_time": "4h 19m 14s", "loss_scale": 1.0, "consumed_samples": 1022976, "global_step/max_steps": "3996/6362"} +{"lm loss": 4.89809608, "grad_norm": 0.42745113, "learning_rate": 3.526e-05, "elapsed_time_per_iteration": 6.68845415, "memory(GiB)": 21.51, "elapsed_time": "7h 17m 57s", "remaining_time": "4h 19m 8s", "loss_scale": 1.0, "consumed_samples": 1023232, "global_step/max_steps": "3997/6362"} +{"lm loss": 4.88483572, "grad_norm": 0.39848071, "learning_rate": 3.523e-05, "elapsed_time_per_iteration": 6.55055118, "memory(GiB)": 21.51, "elapsed_time": "7h 18m 4s", "remaining_time": "4h 19m 1s", "loss_scale": 1.0, "consumed_samples": 1023488, "global_step/max_steps": "3998/6362"} +{"lm loss": 4.90719509, "grad_norm": 0.41025558, "learning_rate": 3.521e-05, "elapsed_time_per_iteration": 6.62265491, "memory(GiB)": 21.51, "elapsed_time": "7h 18m 10s", "remaining_time": "4h 18m 55s", "loss_scale": 1.0, "consumed_samples": 1023744, "global_step/max_steps": "3999/6362"} +{"lm loss": 4.88195419, "grad_norm": 0.45600834, "learning_rate": 3.519e-05, "elapsed_time_per_iteration": 6.63010478, "memory(GiB)": 21.51, "elapsed_time": "7h 18m 17s", "remaining_time": "4h 18m 48s", "loss_scale": 1.0, "consumed_samples": 1024000, "global_step/max_steps": "4000/6362"} +{"lm loss": 4.90349579, "grad_norm": 0.41239592, "learning_rate": 3.516e-05, "elapsed_time_per_iteration": 6.45741653, "memory(GiB)": 21.51, "elapsed_time": "7h 18m 23s", "remaining_time": "4h 18m 41s", "loss_scale": 1.0, "consumed_samples": 1024256, "global_step/max_steps": "4001/6362"} +{"lm loss": 4.93072414, "grad_norm": 0.3921797, "learning_rate": 3.514e-05, "elapsed_time_per_iteration": 6.68504906, "memory(GiB)": 21.51, "elapsed_time": "7h 18m 30s", "remaining_time": "4h 18m 35s", "loss_scale": 1.0, "consumed_samples": 1024512, "global_step/max_steps": "4002/6362"} +{"lm loss": 4.8906126, "grad_norm": 0.45569283, "learning_rate": 3.512e-05, "elapsed_time_per_iteration": 6.44666672, "memory(GiB)": 21.51, "elapsed_time": "7h 18m 36s", "remaining_time": "4h 18m 28s", "loss_scale": 1.0, "consumed_samples": 1024768, "global_step/max_steps": "4003/6362"} +{"lm loss": 4.8848753, "grad_norm": 0.42936671, "learning_rate": 3.509e-05, "elapsed_time_per_iteration": 6.73356032, "memory(GiB)": 21.51, "elapsed_time": "7h 18m 43s", "remaining_time": "4h 18m 22s", "loss_scale": 1.0, "consumed_samples": 1025024, "global_step/max_steps": "4004/6362"} +{"lm loss": 4.8981185, "grad_norm": 0.42076173, "learning_rate": 3.507e-05, "elapsed_time_per_iteration": 6.50776148, "memory(GiB)": 21.51, "elapsed_time": "7h 18m 50s", "remaining_time": "4h 18m 15s", "loss_scale": 1.0, "consumed_samples": 1025280, "global_step/max_steps": "4005/6362"} +{"lm loss": 4.91563892, "grad_norm": 0.41373923, "learning_rate": 3.504e-05, "elapsed_time_per_iteration": 6.55977845, "memory(GiB)": 21.51, "elapsed_time": "7h 18m 56s", "remaining_time": "4h 18m 9s", "loss_scale": 1.0, "consumed_samples": 1025536, "global_step/max_steps": "4006/6362"} +{"lm loss": 4.8668313, "grad_norm": 0.46255854, "learning_rate": 3.502e-05, "elapsed_time_per_iteration": 6.72359252, "memory(GiB)": 21.51, "elapsed_time": "7h 19m 3s", "remaining_time": "4h 18m 2s", "loss_scale": 1.0, "consumed_samples": 1025792, "global_step/max_steps": "4007/6362"} +{"lm loss": 4.90805149, "grad_norm": 0.4732317, "learning_rate": 3.5e-05, "elapsed_time_per_iteration": 6.77818298, "memory(GiB)": 21.51, "elapsed_time": "7h 19m 10s", "remaining_time": "4h 17m 56s", "loss_scale": 1.0, "consumed_samples": 1026048, "global_step/max_steps": "4008/6362"} +{"lm loss": 4.90067482, "grad_norm": 0.41633546, "learning_rate": 3.497e-05, "elapsed_time_per_iteration": 6.54730177, "memory(GiB)": 21.51, "elapsed_time": "7h 19m 16s", "remaining_time": "4h 17m 49s", "loss_scale": 1.0, "consumed_samples": 1026304, "global_step/max_steps": "4009/6362"} +{"lm loss": 4.90028238, "grad_norm": 0.45511851, "learning_rate": 3.495e-05, "elapsed_time_per_iteration": 6.58047032, "memory(GiB)": 21.51, "elapsed_time": "7h 19m 23s", "remaining_time": "4h 17m 42s", "loss_scale": 1.0, "consumed_samples": 1026560, "global_step/max_steps": "4010/6362"} +{"lm loss": 4.88082266, "grad_norm": 0.45198336, "learning_rate": 3.493e-05, "elapsed_time_per_iteration": 6.46321511, "memory(GiB)": 21.51, "elapsed_time": "7h 19m 29s", "remaining_time": "4h 17m 36s", "loss_scale": 1.0, "consumed_samples": 1026816, "global_step/max_steps": "4011/6362"} +{"lm loss": 4.8897562, "grad_norm": 0.43742034, "learning_rate": 3.49e-05, "elapsed_time_per_iteration": 6.77400327, "memory(GiB)": 21.51, "elapsed_time": "7h 19m 36s", "remaining_time": "4h 17m 29s", "loss_scale": 1.0, "consumed_samples": 1027072, "global_step/max_steps": "4012/6362"} +{"lm loss": 4.90348339, "grad_norm": 0.43803623, "learning_rate": 3.488e-05, "elapsed_time_per_iteration": 6.55833983, "memory(GiB)": 21.51, "elapsed_time": "7h 19m 43s", "remaining_time": "4h 17m 23s", "loss_scale": 1.0, "consumed_samples": 1027328, "global_step/max_steps": "4013/6362"} +{"lm loss": 4.91473246, "grad_norm": 0.46533403, "learning_rate": 3.486e-05, "elapsed_time_per_iteration": 6.58729982, "memory(GiB)": 21.51, "elapsed_time": "7h 19m 49s", "remaining_time": "4h 17m 16s", "loss_scale": 1.0, "consumed_samples": 1027584, "global_step/max_steps": "4014/6362"} +{"lm loss": 4.90762043, "grad_norm": 0.45482206, "learning_rate": 3.483e-05, "elapsed_time_per_iteration": 6.46595335, "memory(GiB)": 21.51, "elapsed_time": "7h 19m 56s", "remaining_time": "4h 17m 10s", "loss_scale": 1.0, "consumed_samples": 1027840, "global_step/max_steps": "4015/6362"} +{"lm loss": 4.91349125, "grad_norm": 0.42420068, "learning_rate": 3.481e-05, "elapsed_time_per_iteration": 6.66739702, "memory(GiB)": 21.51, "elapsed_time": "7h 20m 2s", "remaining_time": "4h 17m 3s", "loss_scale": 1.0, "consumed_samples": 1028096, "global_step/max_steps": "4016/6362"} +{"lm loss": 4.90047216, "grad_norm": 0.37445602, "learning_rate": 3.478e-05, "elapsed_time_per_iteration": 6.87531686, "memory(GiB)": 21.51, "elapsed_time": "7h 20m 9s", "remaining_time": "4h 16m 57s", "loss_scale": 1.0, "consumed_samples": 1028352, "global_step/max_steps": "4017/6362"} +{"lm loss": 4.90662289, "grad_norm": 0.46355516, "learning_rate": 3.476e-05, "elapsed_time_per_iteration": 6.51155639, "memory(GiB)": 21.51, "elapsed_time": "7h 20m 16s", "remaining_time": "4h 16m 50s", "loss_scale": 1.0, "consumed_samples": 1028608, "global_step/max_steps": "4018/6362"} +{"lm loss": 4.90305996, "grad_norm": 0.41056424, "learning_rate": 3.474e-05, "elapsed_time_per_iteration": 6.60663509, "memory(GiB)": 21.51, "elapsed_time": "7h 20m 22s", "remaining_time": "4h 16m 44s", "loss_scale": 1.0, "consumed_samples": 1028864, "global_step/max_steps": "4019/6362"} +{"lm loss": 4.91280985, "grad_norm": 0.42724857, "learning_rate": 3.471e-05, "elapsed_time_per_iteration": 6.73016214, "memory(GiB)": 21.51, "elapsed_time": "7h 20m 29s", "remaining_time": "4h 16m 37s", "loss_scale": 1.0, "consumed_samples": 1029120, "global_step/max_steps": "4020/6362"} +{"lm loss": 4.89630318, "grad_norm": 0.40483505, "learning_rate": 3.469e-05, "elapsed_time_per_iteration": 6.5934515, "memory(GiB)": 21.51, "elapsed_time": "7h 20m 36s", "remaining_time": "4h 16m 30s", "loss_scale": 1.0, "consumed_samples": 1029376, "global_step/max_steps": "4021/6362"} +{"lm loss": 4.91249514, "grad_norm": 0.38583469, "learning_rate": 3.467e-05, "elapsed_time_per_iteration": 6.57975578, "memory(GiB)": 21.51, "elapsed_time": "7h 20m 42s", "remaining_time": "4h 16m 24s", "loss_scale": 1.0, "consumed_samples": 1029632, "global_step/max_steps": "4022/6362"} +{"lm loss": 4.89677238, "grad_norm": 0.42849624, "learning_rate": 3.464e-05, "elapsed_time_per_iteration": 6.7340939, "memory(GiB)": 21.51, "elapsed_time": "7h 20m 49s", "remaining_time": "4h 16m 17s", "loss_scale": 1.0, "consumed_samples": 1029888, "global_step/max_steps": "4023/6362"} +{"lm loss": 4.92007637, "grad_norm": 0.41406539, "learning_rate": 3.462e-05, "elapsed_time_per_iteration": 6.7437079, "memory(GiB)": 21.51, "elapsed_time": "7h 20m 56s", "remaining_time": "4h 16m 11s", "loss_scale": 1.0, "consumed_samples": 1030144, "global_step/max_steps": "4024/6362"} +{"lm loss": 4.88824463, "grad_norm": 0.36230209, "learning_rate": 3.459e-05, "elapsed_time_per_iteration": 6.47285628, "memory(GiB)": 21.51, "elapsed_time": "7h 21m 2s", "remaining_time": "4h 16m 4s", "loss_scale": 1.0, "consumed_samples": 1030400, "global_step/max_steps": "4025/6362"} +{"lm loss": 4.90052128, "grad_norm": 0.43436003, "learning_rate": 3.457e-05, "elapsed_time_per_iteration": 6.75969672, "memory(GiB)": 21.51, "elapsed_time": "7h 21m 9s", "remaining_time": "4h 15m 58s", "loss_scale": 1.0, "consumed_samples": 1030656, "global_step/max_steps": "4026/6362"} +{"lm loss": 4.89582491, "grad_norm": 0.41410542, "learning_rate": 3.455e-05, "elapsed_time_per_iteration": 6.57340789, "memory(GiB)": 21.51, "elapsed_time": "7h 21m 16s", "remaining_time": "4h 15m 51s", "loss_scale": 1.0, "consumed_samples": 1030912, "global_step/max_steps": "4027/6362"} +{"lm loss": 4.88764811, "grad_norm": 0.43037537, "learning_rate": 3.452e-05, "elapsed_time_per_iteration": 6.54072118, "memory(GiB)": 21.51, "elapsed_time": "7h 21m 22s", "remaining_time": "4h 15m 45s", "loss_scale": 1.0, "consumed_samples": 1031168, "global_step/max_steps": "4028/6362"} +{"lm loss": 4.91139317, "grad_norm": 0.43250147, "learning_rate": 3.45e-05, "elapsed_time_per_iteration": 6.58360219, "memory(GiB)": 21.51, "elapsed_time": "7h 21m 29s", "remaining_time": "4h 15m 38s", "loss_scale": 1.0, "consumed_samples": 1031424, "global_step/max_steps": "4029/6362"} +{"lm loss": 4.90815115, "grad_norm": 0.38897011, "learning_rate": 3.448e-05, "elapsed_time_per_iteration": 6.70093441, "memory(GiB)": 21.51, "elapsed_time": "7h 21m 35s", "remaining_time": "4h 15m 32s", "loss_scale": 1.0, "consumed_samples": 1031680, "global_step/max_steps": "4030/6362"} +{"lm loss": 4.89092922, "grad_norm": 0.41272441, "learning_rate": 3.445e-05, "elapsed_time_per_iteration": 6.52274537, "memory(GiB)": 21.51, "elapsed_time": "7h 21m 42s", "remaining_time": "4h 15m 25s", "loss_scale": 1.0, "consumed_samples": 1031936, "global_step/max_steps": "4031/6362"} +{"lm loss": 4.89264822, "grad_norm": 0.43043634, "learning_rate": 3.443e-05, "elapsed_time_per_iteration": 6.44615149, "memory(GiB)": 21.51, "elapsed_time": "7h 21m 48s", "remaining_time": "4h 15m 18s", "loss_scale": 1.0, "consumed_samples": 1032192, "global_step/max_steps": "4032/6362"} +{"lm loss": 4.90405416, "grad_norm": 0.4061943, "learning_rate": 3.441e-05, "elapsed_time_per_iteration": 6.4794848, "memory(GiB)": 21.51, "elapsed_time": "7h 21m 55s", "remaining_time": "4h 15m 12s", "loss_scale": 1.0, "consumed_samples": 1032448, "global_step/max_steps": "4033/6362"} +{"lm loss": 4.89977217, "grad_norm": 0.45932734, "learning_rate": 3.438e-05, "elapsed_time_per_iteration": 6.30158901, "memory(GiB)": 21.51, "elapsed_time": "7h 22m 1s", "remaining_time": "4h 15m 5s", "loss_scale": 1.0, "consumed_samples": 1032704, "global_step/max_steps": "4034/6362"} +{"lm loss": 4.90770721, "grad_norm": 0.36737284, "learning_rate": 3.436e-05, "elapsed_time_per_iteration": 6.41046309, "memory(GiB)": 21.51, "elapsed_time": "7h 22m 7s", "remaining_time": "4h 14m 58s", "loss_scale": 1.0, "consumed_samples": 1032960, "global_step/max_steps": "4035/6362"} +{"lm loss": 4.91324091, "grad_norm": 0.39633447, "learning_rate": 3.434e-05, "elapsed_time_per_iteration": 6.42361856, "memory(GiB)": 21.51, "elapsed_time": "7h 22m 14s", "remaining_time": "4h 14m 52s", "loss_scale": 1.0, "consumed_samples": 1033216, "global_step/max_steps": "4036/6362"} +{"lm loss": 4.85036945, "grad_norm": 0.44956657, "learning_rate": 3.431e-05, "elapsed_time_per_iteration": 6.6359663, "memory(GiB)": 21.51, "elapsed_time": "7h 22m 21s", "remaining_time": "4h 14m 45s", "loss_scale": 1.0, "consumed_samples": 1033472, "global_step/max_steps": "4037/6362"} +{"lm loss": 4.89622307, "grad_norm": 0.43346849, "learning_rate": 3.429e-05, "elapsed_time_per_iteration": 6.51071429, "memory(GiB)": 21.51, "elapsed_time": "7h 22m 27s", "remaining_time": "4h 14m 38s", "loss_scale": 1.0, "consumed_samples": 1033728, "global_step/max_steps": "4038/6362"} +{"lm loss": 4.90148401, "grad_norm": 0.42699307, "learning_rate": 3.426e-05, "elapsed_time_per_iteration": 6.57524467, "memory(GiB)": 21.51, "elapsed_time": "7h 22m 34s", "remaining_time": "4h 14m 32s", "loss_scale": 1.0, "consumed_samples": 1033984, "global_step/max_steps": "4039/6362"} +{"lm loss": 4.87621546, "grad_norm": 0.39493969, "learning_rate": 3.424e-05, "elapsed_time_per_iteration": 6.56749439, "memory(GiB)": 21.51, "elapsed_time": "7h 22m 40s", "remaining_time": "4h 14m 25s", "loss_scale": 1.0, "consumed_samples": 1034240, "global_step/max_steps": "4040/6362"} +{"lm loss": 4.88047218, "grad_norm": 0.42360002, "learning_rate": 3.422e-05, "elapsed_time_per_iteration": 6.76892948, "memory(GiB)": 21.51, "elapsed_time": "7h 22m 47s", "remaining_time": "4h 14m 19s", "loss_scale": 1.0, "consumed_samples": 1034496, "global_step/max_steps": "4041/6362"} +{"lm loss": 4.91319132, "grad_norm": 0.39245954, "learning_rate": 3.419e-05, "elapsed_time_per_iteration": 6.58491993, "memory(GiB)": 21.51, "elapsed_time": "7h 22m 54s", "remaining_time": "4h 14m 12s", "loss_scale": 1.0, "consumed_samples": 1034752, "global_step/max_steps": "4042/6362"} +{"lm loss": 4.92085218, "grad_norm": 0.41275528, "learning_rate": 3.417e-05, "elapsed_time_per_iteration": 6.40167212, "memory(GiB)": 21.51, "elapsed_time": "7h 23m 0s", "remaining_time": "4h 14m 6s", "loss_scale": 1.0, "consumed_samples": 1035008, "global_step/max_steps": "4043/6362"} +{"lm loss": 4.8917098, "grad_norm": 0.44230223, "learning_rate": 3.415e-05, "elapsed_time_per_iteration": 6.58659697, "memory(GiB)": 21.51, "elapsed_time": "7h 23m 7s", "remaining_time": "4h 13m 59s", "loss_scale": 1.0, "consumed_samples": 1035264, "global_step/max_steps": "4044/6362"} +{"lm loss": 4.88324547, "grad_norm": 0.43786731, "learning_rate": 3.412e-05, "elapsed_time_per_iteration": 6.50770545, "memory(GiB)": 21.51, "elapsed_time": "7h 23m 13s", "remaining_time": "4h 13m 52s", "loss_scale": 1.0, "consumed_samples": 1035520, "global_step/max_steps": "4045/6362"} +{"lm loss": 4.89947176, "grad_norm": 0.38956109, "learning_rate": 3.41e-05, "elapsed_time_per_iteration": 6.69519186, "memory(GiB)": 21.51, "elapsed_time": "7h 23m 20s", "remaining_time": "4h 13m 46s", "loss_scale": 1.0, "consumed_samples": 1035776, "global_step/max_steps": "4046/6362"} +{"lm loss": 4.89724779, "grad_norm": 0.42683208, "learning_rate": 3.408e-05, "elapsed_time_per_iteration": 6.39735556, "memory(GiB)": 21.51, "elapsed_time": "7h 23m 26s", "remaining_time": "4h 13m 39s", "loss_scale": 1.0, "consumed_samples": 1036032, "global_step/max_steps": "4047/6362"} +{"lm loss": 4.88414335, "grad_norm": 0.43275017, "learning_rate": 3.405e-05, "elapsed_time_per_iteration": 6.46194482, "memory(GiB)": 21.51, "elapsed_time": "7h 23m 33s", "remaining_time": "4h 13m 33s", "loss_scale": 1.0, "consumed_samples": 1036288, "global_step/max_steps": "4048/6362"} +{"lm loss": 4.88209057, "grad_norm": 0.45635405, "learning_rate": 3.403e-05, "elapsed_time_per_iteration": 6.75451922, "memory(GiB)": 21.51, "elapsed_time": "7h 23m 39s", "remaining_time": "4h 13m 26s", "loss_scale": 1.0, "consumed_samples": 1036544, "global_step/max_steps": "4049/6362"} +{"lm loss": 4.86863804, "grad_norm": 0.4456338, "learning_rate": 3.401e-05, "elapsed_time_per_iteration": 6.47941732, "memory(GiB)": 21.51, "elapsed_time": "7h 23m 46s", "remaining_time": "4h 13m 20s", "loss_scale": 1.0, "consumed_samples": 1036800, "global_step/max_steps": "4050/6362"} +{"lm loss": 4.89776564, "grad_norm": 0.42475066, "learning_rate": 3.398e-05, "elapsed_time_per_iteration": 6.63974643, "memory(GiB)": 21.51, "elapsed_time": "7h 23m 52s", "remaining_time": "4h 13m 13s", "loss_scale": 1.0, "consumed_samples": 1037056, "global_step/max_steps": "4051/6362"} +{"lm loss": 4.87904882, "grad_norm": 0.40892774, "learning_rate": 3.396e-05, "elapsed_time_per_iteration": 6.52907562, "memory(GiB)": 21.51, "elapsed_time": "7h 23m 59s", "remaining_time": "4h 13m 6s", "loss_scale": 1.0, "consumed_samples": 1037312, "global_step/max_steps": "4052/6362"} +{"lm loss": 4.91914082, "grad_norm": 0.45398885, "learning_rate": 3.394e-05, "elapsed_time_per_iteration": 6.74159598, "memory(GiB)": 21.51, "elapsed_time": "7h 24m 6s", "remaining_time": "4h 13m 0s", "loss_scale": 1.0, "consumed_samples": 1037568, "global_step/max_steps": "4053/6362"} +{"lm loss": 4.90155935, "grad_norm": 0.455975, "learning_rate": 3.391e-05, "elapsed_time_per_iteration": 6.64816332, "memory(GiB)": 21.51, "elapsed_time": "7h 24m 12s", "remaining_time": "4h 12m 53s", "loss_scale": 1.0, "consumed_samples": 1037824, "global_step/max_steps": "4054/6362"} +{"lm loss": 4.89742899, "grad_norm": 0.40491384, "learning_rate": 3.389e-05, "elapsed_time_per_iteration": 6.58972359, "memory(GiB)": 21.51, "elapsed_time": "7h 24m 19s", "remaining_time": "4h 12m 47s", "loss_scale": 1.0, "consumed_samples": 1038080, "global_step/max_steps": "4055/6362"} +{"lm loss": 4.88734579, "grad_norm": 0.44654152, "learning_rate": 3.386e-05, "elapsed_time_per_iteration": 6.5575192, "memory(GiB)": 21.51, "elapsed_time": "7h 24m 26s", "remaining_time": "4h 12m 40s", "loss_scale": 1.0, "consumed_samples": 1038336, "global_step/max_steps": "4056/6362"} +{"lm loss": 4.92056322, "grad_norm": 0.48131022, "learning_rate": 3.384e-05, "elapsed_time_per_iteration": 6.60340142, "memory(GiB)": 21.51, "elapsed_time": "7h 24m 32s", "remaining_time": "4h 12m 34s", "loss_scale": 1.0, "consumed_samples": 1038592, "global_step/max_steps": "4057/6362"} +{"lm loss": 4.90910959, "grad_norm": 0.47269887, "learning_rate": 3.382e-05, "elapsed_time_per_iteration": 6.7452147, "memory(GiB)": 21.51, "elapsed_time": "7h 24m 39s", "remaining_time": "4h 12m 27s", "loss_scale": 1.0, "consumed_samples": 1038848, "global_step/max_steps": "4058/6362"} +{"lm loss": 4.89553976, "grad_norm": 0.41841549, "learning_rate": 3.379e-05, "elapsed_time_per_iteration": 6.47522855, "memory(GiB)": 21.51, "elapsed_time": "7h 24m 45s", "remaining_time": "4h 12m 21s", "loss_scale": 1.0, "consumed_samples": 1039104, "global_step/max_steps": "4059/6362"} +{"lm loss": 4.92392683, "grad_norm": 0.47994491, "learning_rate": 3.377e-05, "elapsed_time_per_iteration": 6.55475259, "memory(GiB)": 21.51, "elapsed_time": "7h 24m 52s", "remaining_time": "4h 12m 14s", "loss_scale": 1.0, "consumed_samples": 1039360, "global_step/max_steps": "4060/6362"} +{"lm loss": 4.89247608, "grad_norm": 0.45801553, "learning_rate": 3.375e-05, "elapsed_time_per_iteration": 6.57219052, "memory(GiB)": 21.51, "elapsed_time": "7h 24m 59s", "remaining_time": "4h 12m 7s", "loss_scale": 1.0, "consumed_samples": 1039616, "global_step/max_steps": "4061/6362"} +{"lm loss": 4.90432215, "grad_norm": 0.44069016, "learning_rate": 3.372e-05, "elapsed_time_per_iteration": 6.5012114, "memory(GiB)": 21.51, "elapsed_time": "7h 25m 5s", "remaining_time": "4h 12m 1s", "loss_scale": 1.0, "consumed_samples": 1039872, "global_step/max_steps": "4062/6362"} +{"lm loss": 4.90818882, "grad_norm": 0.40177786, "learning_rate": 3.37e-05, "elapsed_time_per_iteration": 6.57551312, "memory(GiB)": 21.51, "elapsed_time": "7h 25m 12s", "remaining_time": "4h 11m 54s", "loss_scale": 1.0, "consumed_samples": 1040128, "global_step/max_steps": "4063/6362"} +{"lm loss": 4.90142775, "grad_norm": 0.47001392, "learning_rate": 3.368e-05, "elapsed_time_per_iteration": 6.52887082, "memory(GiB)": 21.51, "elapsed_time": "7h 25m 18s", "remaining_time": "4h 11m 48s", "loss_scale": 1.0, "consumed_samples": 1040384, "global_step/max_steps": "4064/6362"} +{"lm loss": 4.90687799, "grad_norm": 0.42271873, "learning_rate": 3.365e-05, "elapsed_time_per_iteration": 6.64309597, "memory(GiB)": 21.51, "elapsed_time": "7h 25m 25s", "remaining_time": "4h 11m 41s", "loss_scale": 1.0, "consumed_samples": 1040640, "global_step/max_steps": "4065/6362"} +{"lm loss": 4.91116285, "grad_norm": 0.44490263, "learning_rate": 3.363e-05, "elapsed_time_per_iteration": 7.14635181, "memory(GiB)": 21.51, "elapsed_time": "7h 25m 32s", "remaining_time": "4h 11m 35s", "loss_scale": 1.0, "consumed_samples": 1040896, "global_step/max_steps": "4066/6362"} +{"lm loss": 4.89784908, "grad_norm": 0.44116452, "learning_rate": 3.361e-05, "elapsed_time_per_iteration": 6.45132184, "memory(GiB)": 21.51, "elapsed_time": "7h 25m 38s", "remaining_time": "4h 11m 28s", "loss_scale": 1.0, "consumed_samples": 1041152, "global_step/max_steps": "4067/6362"} +{"lm loss": 4.91602421, "grad_norm": 0.43735197, "learning_rate": 3.358e-05, "elapsed_time_per_iteration": 6.47827244, "memory(GiB)": 21.51, "elapsed_time": "7h 25m 45s", "remaining_time": "4h 11m 22s", "loss_scale": 1.0, "consumed_samples": 1041408, "global_step/max_steps": "4068/6362"} +{"lm loss": 4.89019012, "grad_norm": 0.44525406, "learning_rate": 3.356e-05, "elapsed_time_per_iteration": 6.55629539, "memory(GiB)": 21.51, "elapsed_time": "7h 25m 51s", "remaining_time": "4h 11m 15s", "loss_scale": 1.0, "consumed_samples": 1041664, "global_step/max_steps": "4069/6362"} +{"lm loss": 4.89600563, "grad_norm": 0.44951889, "learning_rate": 3.354e-05, "elapsed_time_per_iteration": 6.67869997, "memory(GiB)": 21.51, "elapsed_time": "7h 25m 58s", "remaining_time": "4h 11m 8s", "loss_scale": 1.0, "consumed_samples": 1041920, "global_step/max_steps": "4070/6362"} +{"lm loss": 4.9292016, "grad_norm": 0.42531294, "learning_rate": 3.351e-05, "elapsed_time_per_iteration": 6.4881475, "memory(GiB)": 21.51, "elapsed_time": "7h 26m 5s", "remaining_time": "4h 11m 2s", "loss_scale": 1.0, "consumed_samples": 1042176, "global_step/max_steps": "4071/6362"} +{"lm loss": 4.91897202, "grad_norm": 0.44183776, "learning_rate": 3.349e-05, "elapsed_time_per_iteration": 6.58458591, "memory(GiB)": 21.51, "elapsed_time": "7h 26m 11s", "remaining_time": "4h 10m 55s", "loss_scale": 1.0, "consumed_samples": 1042432, "global_step/max_steps": "4072/6362"} +{"lm loss": 4.91614294, "grad_norm": 0.41571429, "learning_rate": 3.347e-05, "elapsed_time_per_iteration": 6.5038271, "memory(GiB)": 21.51, "elapsed_time": "7h 26m 18s", "remaining_time": "4h 10m 49s", "loss_scale": 1.0, "consumed_samples": 1042688, "global_step/max_steps": "4073/6362"} +{"lm loss": 4.88208723, "grad_norm": 0.40425098, "learning_rate": 3.344e-05, "elapsed_time_per_iteration": 6.44727516, "memory(GiB)": 21.51, "elapsed_time": "7h 26m 24s", "remaining_time": "4h 10m 42s", "loss_scale": 1.0, "consumed_samples": 1042944, "global_step/max_steps": "4074/6362"} +{"lm loss": 4.88363695, "grad_norm": 0.46971083, "learning_rate": 3.342e-05, "elapsed_time_per_iteration": 6.47922277, "memory(GiB)": 21.51, "elapsed_time": "7h 26m 31s", "remaining_time": "4h 10m 35s", "loss_scale": 1.0, "consumed_samples": 1043200, "global_step/max_steps": "4075/6362"} +{"lm loss": 4.87812996, "grad_norm": 0.45634687, "learning_rate": 3.34e-05, "elapsed_time_per_iteration": 6.73609567, "memory(GiB)": 21.51, "elapsed_time": "7h 26m 37s", "remaining_time": "4h 10m 29s", "loss_scale": 1.0, "consumed_samples": 1043456, "global_step/max_steps": "4076/6362"} +{"lm loss": 4.89190817, "grad_norm": 0.46433261, "learning_rate": 3.337e-05, "elapsed_time_per_iteration": 6.71467423, "memory(GiB)": 21.51, "elapsed_time": "7h 26m 44s", "remaining_time": "4h 10m 22s", "loss_scale": 1.0, "consumed_samples": 1043712, "global_step/max_steps": "4077/6362"} +{"lm loss": 4.8834691, "grad_norm": 0.46064058, "learning_rate": 3.335e-05, "elapsed_time_per_iteration": 6.68968606, "memory(GiB)": 21.51, "elapsed_time": "7h 26m 51s", "remaining_time": "4h 10m 16s", "loss_scale": 1.0, "consumed_samples": 1043968, "global_step/max_steps": "4078/6362"} +{"lm loss": 4.88058758, "grad_norm": 0.47274104, "learning_rate": 3.333e-05, "elapsed_time_per_iteration": 6.67307425, "memory(GiB)": 21.51, "elapsed_time": "7h 26m 57s", "remaining_time": "4h 10m 9s", "loss_scale": 1.0, "consumed_samples": 1044224, "global_step/max_steps": "4079/6362"} +{"lm loss": 4.91075468, "grad_norm": 0.44038084, "learning_rate": 3.33e-05, "elapsed_time_per_iteration": 6.46021032, "memory(GiB)": 21.51, "elapsed_time": "7h 27m 4s", "remaining_time": "4h 10m 3s", "loss_scale": 1.0, "consumed_samples": 1044480, "global_step/max_steps": "4080/6362"} +{"lm loss": 4.90824938, "grad_norm": 0.42259189, "learning_rate": 3.328e-05, "elapsed_time_per_iteration": 6.68613315, "memory(GiB)": 21.51, "elapsed_time": "7h 27m 11s", "remaining_time": "4h 9m 56s", "loss_scale": 1.0, "consumed_samples": 1044736, "global_step/max_steps": "4081/6362"} +{"lm loss": 4.88068867, "grad_norm": 0.41486317, "learning_rate": 3.326e-05, "elapsed_time_per_iteration": 6.61376095, "memory(GiB)": 21.51, "elapsed_time": "7h 27m 17s", "remaining_time": "4h 9m 50s", "loss_scale": 1.0, "consumed_samples": 1044992, "global_step/max_steps": "4082/6362"} +{"lm loss": 4.89506245, "grad_norm": 0.40102717, "learning_rate": 3.323e-05, "elapsed_time_per_iteration": 6.40848112, "memory(GiB)": 21.51, "elapsed_time": "7h 27m 24s", "remaining_time": "4h 9m 43s", "loss_scale": 1.0, "consumed_samples": 1045248, "global_step/max_steps": "4083/6362"} +{"lm loss": 4.89333677, "grad_norm": 0.45958725, "learning_rate": 3.321e-05, "elapsed_time_per_iteration": 7.4634068, "memory(GiB)": 21.51, "elapsed_time": "7h 27m 31s", "remaining_time": "4h 9m 37s", "loss_scale": 1.0, "consumed_samples": 1045504, "global_step/max_steps": "4084/6362"} +{"lm loss": 4.8898921, "grad_norm": 0.4174692, "learning_rate": 3.319e-05, "elapsed_time_per_iteration": 6.55927062, "memory(GiB)": 21.51, "elapsed_time": "7h 27m 38s", "remaining_time": "4h 9m 30s", "loss_scale": 1.0, "consumed_samples": 1045760, "global_step/max_steps": "4085/6362"} +{"lm loss": 4.88201618, "grad_norm": 0.43884417, "learning_rate": 3.316e-05, "elapsed_time_per_iteration": 6.55086398, "memory(GiB)": 21.51, "elapsed_time": "7h 27m 44s", "remaining_time": "4h 9m 24s", "loss_scale": 1.0, "consumed_samples": 1046016, "global_step/max_steps": "4086/6362"} +{"lm loss": 4.89856434, "grad_norm": 0.47760674, "learning_rate": 3.314e-05, "elapsed_time_per_iteration": 6.35865283, "memory(GiB)": 21.51, "elapsed_time": "7h 27m 50s", "remaining_time": "4h 9m 17s", "loss_scale": 1.0, "consumed_samples": 1046272, "global_step/max_steps": "4087/6362"} +{"lm loss": 4.90641165, "grad_norm": 0.43226203, "learning_rate": 3.312e-05, "elapsed_time_per_iteration": 6.54882407, "memory(GiB)": 21.51, "elapsed_time": "7h 27m 57s", "remaining_time": "4h 9m 10s", "loss_scale": 1.0, "consumed_samples": 1046528, "global_step/max_steps": "4088/6362"} +{"lm loss": 4.90331554, "grad_norm": 0.41538852, "learning_rate": 3.309e-05, "elapsed_time_per_iteration": 6.47962642, "memory(GiB)": 21.51, "elapsed_time": "7h 28m 4s", "remaining_time": "4h 9m 4s", "loss_scale": 1.0, "consumed_samples": 1046784, "global_step/max_steps": "4089/6362"} +{"lm loss": 4.90933752, "grad_norm": 0.42576873, "learning_rate": 3.307e-05, "elapsed_time_per_iteration": 6.65119171, "memory(GiB)": 21.51, "elapsed_time": "7h 28m 10s", "remaining_time": "4h 8m 57s", "loss_scale": 1.0, "consumed_samples": 1047040, "global_step/max_steps": "4090/6362"} +{"lm loss": 4.90032434, "grad_norm": 0.40660611, "learning_rate": 3.305e-05, "elapsed_time_per_iteration": 6.44090128, "memory(GiB)": 21.51, "elapsed_time": "7h 28m 17s", "remaining_time": "4h 8m 51s", "loss_scale": 1.0, "consumed_samples": 1047296, "global_step/max_steps": "4091/6362"} +{"lm loss": 4.89226723, "grad_norm": 0.43869588, "learning_rate": 3.302e-05, "elapsed_time_per_iteration": 6.51589179, "memory(GiB)": 21.51, "elapsed_time": "7h 28m 23s", "remaining_time": "4h 8m 44s", "loss_scale": 1.0, "consumed_samples": 1047552, "global_step/max_steps": "4092/6362"} +{"lm loss": 4.91511154, "grad_norm": 0.37049821, "learning_rate": 3.3e-05, "elapsed_time_per_iteration": 6.3812356, "memory(GiB)": 21.51, "elapsed_time": "7h 28m 30s", "remaining_time": "4h 8m 37s", "loss_scale": 1.0, "consumed_samples": 1047808, "global_step/max_steps": "4093/6362"} +{"lm loss": 4.88404942, "grad_norm": 0.42518815, "learning_rate": 3.298e-05, "elapsed_time_per_iteration": 6.6088891, "memory(GiB)": 21.51, "elapsed_time": "7h 28m 36s", "remaining_time": "4h 8m 31s", "loss_scale": 1.0, "consumed_samples": 1048064, "global_step/max_steps": "4094/6362"} +{"lm loss": 4.89843035, "grad_norm": 0.3648122, "learning_rate": 3.295e-05, "elapsed_time_per_iteration": 6.49644971, "memory(GiB)": 21.51, "elapsed_time": "7h 28m 43s", "remaining_time": "4h 8m 24s", "loss_scale": 1.0, "consumed_samples": 1048320, "global_step/max_steps": "4095/6362"} +{"lm loss": 4.90515709, "grad_norm": 0.44019562, "learning_rate": 3.293e-05, "elapsed_time_per_iteration": 6.56390429, "memory(GiB)": 21.51, "elapsed_time": "7h 28m 49s", "remaining_time": "4h 8m 18s", "loss_scale": 1.0, "consumed_samples": 1048576, "global_step/max_steps": "4096/6362"} +{"lm loss": 4.90207195, "grad_norm": 0.3947202, "learning_rate": 3.291e-05, "elapsed_time_per_iteration": 6.56477594, "memory(GiB)": 21.51, "elapsed_time": "7h 28m 56s", "remaining_time": "4h 8m 11s", "loss_scale": 1.0, "consumed_samples": 1048832, "global_step/max_steps": "4097/6362"} +{"lm loss": 4.89295244, "grad_norm": 0.44540897, "learning_rate": 3.288e-05, "elapsed_time_per_iteration": 6.65927577, "memory(GiB)": 21.51, "elapsed_time": "7h 29m 2s", "remaining_time": "4h 8m 4s", "loss_scale": 1.0, "consumed_samples": 1049088, "global_step/max_steps": "4098/6362"} +{"lm loss": 4.88574743, "grad_norm": 0.43527851, "learning_rate": 3.286e-05, "elapsed_time_per_iteration": 6.73860502, "memory(GiB)": 21.51, "elapsed_time": "7h 29m 9s", "remaining_time": "4h 7m 58s", "loss_scale": 1.0, "consumed_samples": 1049344, "global_step/max_steps": "4099/6362"} +{"lm loss": 4.88807917, "grad_norm": 0.46399459, "learning_rate": 3.284e-05, "elapsed_time_per_iteration": 6.5727911, "memory(GiB)": 21.51, "elapsed_time": "7h 29m 16s", "remaining_time": "4h 7m 51s", "loss_scale": 1.0, "consumed_samples": 1049600, "global_step/max_steps": "4100/6362"} +{"lm loss": 4.91434669, "grad_norm": 0.40370223, "learning_rate": 3.281e-05, "elapsed_time_per_iteration": 6.51771569, "memory(GiB)": 21.51, "elapsed_time": "7h 29m 22s", "remaining_time": "4h 7m 45s", "loss_scale": 1.0, "consumed_samples": 1049856, "global_step/max_steps": "4101/6362"} +{"lm loss": 4.88215351, "grad_norm": 0.5005672, "learning_rate": 3.279e-05, "elapsed_time_per_iteration": 7.27381897, "memory(GiB)": 21.51, "elapsed_time": "7h 29m 29s", "remaining_time": "4h 7m 39s", "loss_scale": 1.0, "consumed_samples": 1050112, "global_step/max_steps": "4102/6362"} +{"lm loss": 4.90807199, "grad_norm": 0.37115443, "learning_rate": 3.277e-05, "elapsed_time_per_iteration": 6.47330618, "memory(GiB)": 21.51, "elapsed_time": "7h 29m 36s", "remaining_time": "4h 7m 32s", "loss_scale": 1.0, "consumed_samples": 1050368, "global_step/max_steps": "4103/6362"} +{"lm loss": 4.9128828, "grad_norm": 0.46159604, "learning_rate": 3.274e-05, "elapsed_time_per_iteration": 6.50966024, "memory(GiB)": 21.51, "elapsed_time": "7h 29m 42s", "remaining_time": "4h 7m 25s", "loss_scale": 1.0, "consumed_samples": 1050624, "global_step/max_steps": "4104/6362"} +{"lm loss": 4.88236427, "grad_norm": 0.37680072, "learning_rate": 3.272e-05, "elapsed_time_per_iteration": 6.44126344, "memory(GiB)": 21.51, "elapsed_time": "7h 29m 49s", "remaining_time": "4h 7m 19s", "loss_scale": 1.0, "consumed_samples": 1050880, "global_step/max_steps": "4105/6362"} +{"lm loss": 4.87458897, "grad_norm": 0.43125001, "learning_rate": 3.27e-05, "elapsed_time_per_iteration": 6.66829181, "memory(GiB)": 21.51, "elapsed_time": "7h 29m 56s", "remaining_time": "4h 7m 12s", "loss_scale": 1.0, "consumed_samples": 1051136, "global_step/max_steps": "4106/6362"} +{"lm loss": 4.90961409, "grad_norm": 0.37986416, "learning_rate": 3.267e-05, "elapsed_time_per_iteration": 6.52204323, "memory(GiB)": 21.51, "elapsed_time": "7h 30m 2s", "remaining_time": "4h 7m 6s", "loss_scale": 1.0, "consumed_samples": 1051392, "global_step/max_steps": "4107/6362"} +{"lm loss": 4.8828187, "grad_norm": 0.40488222, "learning_rate": 3.265e-05, "elapsed_time_per_iteration": 6.6094861, "memory(GiB)": 21.51, "elapsed_time": "7h 30m 9s", "remaining_time": "4h 6m 59s", "loss_scale": 1.0, "consumed_samples": 1051648, "global_step/max_steps": "4108/6362"} +{"lm loss": 4.87741661, "grad_norm": 0.40368497, "learning_rate": 3.263e-05, "elapsed_time_per_iteration": 6.64775562, "memory(GiB)": 21.51, "elapsed_time": "7h 30m 15s", "remaining_time": "4h 6m 53s", "loss_scale": 1.0, "consumed_samples": 1051904, "global_step/max_steps": "4109/6362"} +{"lm loss": 4.90954018, "grad_norm": 0.39786416, "learning_rate": 3.26e-05, "elapsed_time_per_iteration": 6.90074825, "memory(GiB)": 21.51, "elapsed_time": "7h 30m 22s", "remaining_time": "4h 6m 46s", "loss_scale": 1.0, "consumed_samples": 1052160, "global_step/max_steps": "4110/6362"} +{"lm loss": 4.90415382, "grad_norm": 0.40139171, "learning_rate": 3.258e-05, "elapsed_time_per_iteration": 6.64552689, "memory(GiB)": 21.51, "elapsed_time": "7h 30m 29s", "remaining_time": "4h 6m 40s", "loss_scale": 1.0, "consumed_samples": 1052416, "global_step/max_steps": "4111/6362"} +{"lm loss": 4.91485119, "grad_norm": 0.37989387, "learning_rate": 3.256e-05, "elapsed_time_per_iteration": 6.77589321, "memory(GiB)": 21.51, "elapsed_time": "7h 30m 36s", "remaining_time": "4h 6m 33s", "loss_scale": 1.0, "consumed_samples": 1052672, "global_step/max_steps": "4112/6362"} +{"lm loss": 4.88948011, "grad_norm": 0.4105171, "learning_rate": 3.253e-05, "elapsed_time_per_iteration": 6.52471709, "memory(GiB)": 21.51, "elapsed_time": "7h 30m 42s", "remaining_time": "4h 6m 27s", "loss_scale": 1.0, "consumed_samples": 1052928, "global_step/max_steps": "4113/6362"} +{"lm loss": 4.86813593, "grad_norm": 0.40759894, "learning_rate": 3.251e-05, "elapsed_time_per_iteration": 6.3986814, "memory(GiB)": 21.51, "elapsed_time": "7h 30m 49s", "remaining_time": "4h 6m 20s", "loss_scale": 1.0, "consumed_samples": 1053184, "global_step/max_steps": "4114/6362"} +{"lm loss": 4.87851524, "grad_norm": 0.39738259, "learning_rate": 3.249e-05, "elapsed_time_per_iteration": 6.66449833, "memory(GiB)": 21.51, "elapsed_time": "7h 30m 55s", "remaining_time": "4h 6m 13s", "loss_scale": 1.0, "consumed_samples": 1053440, "global_step/max_steps": "4115/6362"} +{"lm loss": 4.88055658, "grad_norm": 0.41483712, "learning_rate": 3.246e-05, "elapsed_time_per_iteration": 6.45155454, "memory(GiB)": 21.51, "elapsed_time": "7h 31m 2s", "remaining_time": "4h 6m 7s", "loss_scale": 1.0, "consumed_samples": 1053696, "global_step/max_steps": "4116/6362"} +{"lm loss": 4.90105009, "grad_norm": 0.38821507, "learning_rate": 3.244e-05, "elapsed_time_per_iteration": 6.5917635, "memory(GiB)": 21.51, "elapsed_time": "7h 31m 8s", "remaining_time": "4h 6m 0s", "loss_scale": 1.0, "consumed_samples": 1053952, "global_step/max_steps": "4117/6362"} +{"lm loss": 4.90091467, "grad_norm": 0.39799893, "learning_rate": 3.242e-05, "elapsed_time_per_iteration": 6.614568, "memory(GiB)": 21.51, "elapsed_time": "7h 31m 15s", "remaining_time": "4h 5m 54s", "loss_scale": 1.0, "consumed_samples": 1054208, "global_step/max_steps": "4118/6362"} +{"lm loss": 4.87904072, "grad_norm": 0.37211478, "learning_rate": 3.239e-05, "elapsed_time_per_iteration": 6.57919979, "memory(GiB)": 21.51, "elapsed_time": "7h 31m 22s", "remaining_time": "4h 5m 47s", "loss_scale": 1.0, "consumed_samples": 1054464, "global_step/max_steps": "4119/6362"} +{"lm loss": 4.90506935, "grad_norm": 0.3948248, "learning_rate": 3.237e-05, "elapsed_time_per_iteration": 6.57559347, "memory(GiB)": 21.51, "elapsed_time": "7h 31m 28s", "remaining_time": "4h 5m 40s", "loss_scale": 1.0, "consumed_samples": 1054720, "global_step/max_steps": "4120/6362"} +{"lm loss": 4.91452789, "grad_norm": 0.40821201, "learning_rate": 3.235e-05, "elapsed_time_per_iteration": 6.56534314, "memory(GiB)": 21.51, "elapsed_time": "7h 31m 35s", "remaining_time": "4h 5m 34s", "loss_scale": 1.0, "consumed_samples": 1054976, "global_step/max_steps": "4121/6362"} +{"lm loss": 4.91154766, "grad_norm": 0.37575793, "learning_rate": 3.233e-05, "elapsed_time_per_iteration": 6.57218051, "memory(GiB)": 21.51, "elapsed_time": "7h 31m 41s", "remaining_time": "4h 5m 27s", "loss_scale": 1.0, "consumed_samples": 1055232, "global_step/max_steps": "4122/6362"} +{"lm loss": 4.89385605, "grad_norm": 0.43082315, "learning_rate": 3.23e-05, "elapsed_time_per_iteration": 6.57964683, "memory(GiB)": 21.51, "elapsed_time": "7h 31m 48s", "remaining_time": "4h 5m 21s", "loss_scale": 1.0, "consumed_samples": 1055488, "global_step/max_steps": "4123/6362"} +{"lm loss": 4.84616089, "grad_norm": 0.41582268, "learning_rate": 3.228e-05, "elapsed_time_per_iteration": 6.87444949, "memory(GiB)": 21.51, "elapsed_time": "7h 31m 55s", "remaining_time": "4h 5m 14s", "loss_scale": 1.0, "consumed_samples": 1055744, "global_step/max_steps": "4124/6362"} +{"lm loss": 4.88364363, "grad_norm": 0.43692654, "learning_rate": 3.226e-05, "elapsed_time_per_iteration": 6.50307178, "memory(GiB)": 21.51, "elapsed_time": "7h 32m 1s", "remaining_time": "4h 5m 8s", "loss_scale": 1.0, "consumed_samples": 1056000, "global_step/max_steps": "4125/6362"} +{"lm loss": 4.88561869, "grad_norm": 0.39101392, "learning_rate": 3.223e-05, "elapsed_time_per_iteration": 6.51915646, "memory(GiB)": 21.51, "elapsed_time": "7h 32m 8s", "remaining_time": "4h 5m 1s", "loss_scale": 1.0, "consumed_samples": 1056256, "global_step/max_steps": "4126/6362"} +{"lm loss": 4.8959012, "grad_norm": 0.42131525, "learning_rate": 3.221e-05, "elapsed_time_per_iteration": 6.64801288, "memory(GiB)": 21.51, "elapsed_time": "7h 32m 14s", "remaining_time": "4h 4m 55s", "loss_scale": 1.0, "consumed_samples": 1056512, "global_step/max_steps": "4127/6362"} +{"lm loss": 4.86342096, "grad_norm": 0.41035962, "learning_rate": 3.219e-05, "elapsed_time_per_iteration": 6.64480805, "memory(GiB)": 21.51, "elapsed_time": "7h 32m 21s", "remaining_time": "4h 4m 48s", "loss_scale": 1.0, "consumed_samples": 1056768, "global_step/max_steps": "4128/6362"} +{"lm loss": 4.88374472, "grad_norm": 0.42325327, "learning_rate": 3.216e-05, "elapsed_time_per_iteration": 6.49808955, "memory(GiB)": 21.51, "elapsed_time": "7h 32m 27s", "remaining_time": "4h 4m 41s", "loss_scale": 1.0, "consumed_samples": 1057024, "global_step/max_steps": "4129/6362"} +{"lm loss": 4.89141226, "grad_norm": 0.39974791, "learning_rate": 3.214e-05, "elapsed_time_per_iteration": 6.55468392, "memory(GiB)": 21.51, "elapsed_time": "7h 32m 34s", "remaining_time": "4h 4m 35s", "loss_scale": 1.0, "consumed_samples": 1057280, "global_step/max_steps": "4130/6362"} +{"lm loss": 4.86699581, "grad_norm": 0.42332065, "learning_rate": 3.212e-05, "elapsed_time_per_iteration": 6.51851177, "memory(GiB)": 21.51, "elapsed_time": "7h 32m 41s", "remaining_time": "4h 4m 28s", "loss_scale": 1.0, "consumed_samples": 1057536, "global_step/max_steps": "4131/6362"} +{"lm loss": 4.90093374, "grad_norm": 0.4358677, "learning_rate": 3.209e-05, "elapsed_time_per_iteration": 6.53259826, "memory(GiB)": 21.51, "elapsed_time": "7h 32m 47s", "remaining_time": "4h 4m 22s", "loss_scale": 1.0, "consumed_samples": 1057792, "global_step/max_steps": "4132/6362"} +{"lm loss": 4.90291548, "grad_norm": 0.41849926, "learning_rate": 3.207e-05, "elapsed_time_per_iteration": 6.75169539, "memory(GiB)": 21.51, "elapsed_time": "7h 32m 54s", "remaining_time": "4h 4m 15s", "loss_scale": 1.0, "consumed_samples": 1058048, "global_step/max_steps": "4133/6362"} +{"lm loss": 4.89338779, "grad_norm": 0.43482292, "learning_rate": 3.205e-05, "elapsed_time_per_iteration": 6.57347298, "memory(GiB)": 21.51, "elapsed_time": "7h 33m 0s", "remaining_time": "4h 4m 9s", "loss_scale": 1.0, "consumed_samples": 1058304, "global_step/max_steps": "4134/6362"} +{"lm loss": 4.88853073, "grad_norm": 0.3975423, "learning_rate": 3.202e-05, "elapsed_time_per_iteration": 6.46495128, "memory(GiB)": 21.51, "elapsed_time": "7h 33m 7s", "remaining_time": "4h 4m 2s", "loss_scale": 1.0, "consumed_samples": 1058560, "global_step/max_steps": "4135/6362"} +{"lm loss": 4.87896919, "grad_norm": 0.4534637, "learning_rate": 3.2e-05, "elapsed_time_per_iteration": 6.62531948, "memory(GiB)": 21.51, "elapsed_time": "7h 33m 14s", "remaining_time": "4h 3m 55s", "loss_scale": 1.0, "consumed_samples": 1058816, "global_step/max_steps": "4136/6362"} +{"lm loss": 4.90386295, "grad_norm": 0.40964988, "learning_rate": 3.198e-05, "elapsed_time_per_iteration": 6.5166471, "memory(GiB)": 21.51, "elapsed_time": "7h 33m 20s", "remaining_time": "4h 3m 49s", "loss_scale": 1.0, "consumed_samples": 1059072, "global_step/max_steps": "4137/6362"} +{"lm loss": 4.91669226, "grad_norm": 0.43688783, "learning_rate": 3.196e-05, "elapsed_time_per_iteration": 6.51368117, "memory(GiB)": 21.51, "elapsed_time": "7h 33m 27s", "remaining_time": "4h 3m 42s", "loss_scale": 1.0, "consumed_samples": 1059328, "global_step/max_steps": "4138/6362"} +{"lm loss": 4.90666199, "grad_norm": 0.39295554, "learning_rate": 3.193e-05, "elapsed_time_per_iteration": 6.43412185, "memory(GiB)": 21.51, "elapsed_time": "7h 33m 33s", "remaining_time": "4h 3m 35s", "loss_scale": 1.0, "consumed_samples": 1059584, "global_step/max_steps": "4139/6362"} +{"lm loss": 4.88601351, "grad_norm": 0.4474422, "learning_rate": 3.191e-05, "elapsed_time_per_iteration": 6.76580405, "memory(GiB)": 21.51, "elapsed_time": "7h 33m 40s", "remaining_time": "4h 3m 29s", "loss_scale": 1.0, "consumed_samples": 1059840, "global_step/max_steps": "4140/6362"} +{"lm loss": 4.88904476, "grad_norm": 0.42793944, "learning_rate": 3.189e-05, "elapsed_time_per_iteration": 6.48708653, "memory(GiB)": 21.51, "elapsed_time": "7h 33m 46s", "remaining_time": "4h 3m 22s", "loss_scale": 1.0, "consumed_samples": 1060096, "global_step/max_steps": "4141/6362"} +{"lm loss": 4.88676214, "grad_norm": 0.42420921, "learning_rate": 3.186e-05, "elapsed_time_per_iteration": 6.45670271, "memory(GiB)": 21.51, "elapsed_time": "7h 33m 53s", "remaining_time": "4h 3m 16s", "loss_scale": 1.0, "consumed_samples": 1060352, "global_step/max_steps": "4142/6362"} +{"lm loss": 4.90191507, "grad_norm": 0.4602783, "learning_rate": 3.184e-05, "elapsed_time_per_iteration": 6.67332292, "memory(GiB)": 21.51, "elapsed_time": "7h 33m 59s", "remaining_time": "4h 3m 9s", "loss_scale": 1.0, "consumed_samples": 1060608, "global_step/max_steps": "4143/6362"} +{"lm loss": 4.90005255, "grad_norm": 0.46196797, "learning_rate": 3.182e-05, "elapsed_time_per_iteration": 6.72280169, "memory(GiB)": 21.51, "elapsed_time": "7h 34m 6s", "remaining_time": "4h 3m 3s", "loss_scale": 1.0, "consumed_samples": 1060864, "global_step/max_steps": "4144/6362"} +{"lm loss": 4.90656805, "grad_norm": 0.4494758, "learning_rate": 3.179e-05, "elapsed_time_per_iteration": 6.36637378, "memory(GiB)": 21.51, "elapsed_time": "7h 34m 12s", "remaining_time": "4h 2m 56s", "loss_scale": 1.0, "consumed_samples": 1061120, "global_step/max_steps": "4145/6362"} +{"lm loss": 4.9022007, "grad_norm": 0.51202625, "learning_rate": 3.177e-05, "elapsed_time_per_iteration": 6.29612923, "memory(GiB)": 21.51, "elapsed_time": "7h 34m 19s", "remaining_time": "4h 2m 49s", "loss_scale": 1.0, "consumed_samples": 1061376, "global_step/max_steps": "4146/6362"} +{"lm loss": 4.87819576, "grad_norm": 0.39917415, "learning_rate": 3.175e-05, "elapsed_time_per_iteration": 6.47514391, "memory(GiB)": 21.51, "elapsed_time": "7h 34m 25s", "remaining_time": "4h 2m 43s", "loss_scale": 1.0, "consumed_samples": 1061632, "global_step/max_steps": "4147/6362"} +{"lm loss": 4.88115311, "grad_norm": 0.43659237, "learning_rate": 3.172e-05, "elapsed_time_per_iteration": 6.53619576, "memory(GiB)": 21.51, "elapsed_time": "7h 34m 32s", "remaining_time": "4h 2m 36s", "loss_scale": 1.0, "consumed_samples": 1061888, "global_step/max_steps": "4148/6362"} +{"lm loss": 4.87696218, "grad_norm": 0.49768084, "learning_rate": 3.17e-05, "elapsed_time_per_iteration": 6.53029847, "memory(GiB)": 21.51, "elapsed_time": "7h 34m 38s", "remaining_time": "4h 2m 30s", "loss_scale": 1.0, "consumed_samples": 1062144, "global_step/max_steps": "4149/6362"} +{"lm loss": 4.89408588, "grad_norm": 0.44466555, "learning_rate": 3.168e-05, "elapsed_time_per_iteration": 6.56371737, "memory(GiB)": 21.51, "elapsed_time": "7h 34m 45s", "remaining_time": "4h 2m 23s", "loss_scale": 1.0, "consumed_samples": 1062400, "global_step/max_steps": "4150/6362"} +{"lm loss": 4.89697933, "grad_norm": 0.38896799, "learning_rate": 3.166e-05, "elapsed_time_per_iteration": 6.57372665, "memory(GiB)": 21.51, "elapsed_time": "7h 34m 51s", "remaining_time": "4h 2m 16s", "loss_scale": 1.0, "consumed_samples": 1062656, "global_step/max_steps": "4151/6362"} +{"lm loss": 4.91742134, "grad_norm": 0.4455497, "learning_rate": 3.163e-05, "elapsed_time_per_iteration": 6.73367047, "memory(GiB)": 21.51, "elapsed_time": "7h 34m 58s", "remaining_time": "4h 2m 10s", "loss_scale": 1.0, "consumed_samples": 1062912, "global_step/max_steps": "4152/6362"} +{"lm loss": 4.90925264, "grad_norm": 0.46200699, "learning_rate": 3.161e-05, "elapsed_time_per_iteration": 6.66904044, "memory(GiB)": 21.51, "elapsed_time": "7h 35m 5s", "remaining_time": "4h 2m 3s", "loss_scale": 1.0, "consumed_samples": 1063168, "global_step/max_steps": "4153/6362"} +{"lm loss": 4.90054893, "grad_norm": 0.41121086, "learning_rate": 3.159e-05, "elapsed_time_per_iteration": 6.45457172, "memory(GiB)": 21.51, "elapsed_time": "7h 35m 11s", "remaining_time": "4h 1m 57s", "loss_scale": 1.0, "consumed_samples": 1063424, "global_step/max_steps": "4154/6362"} +{"lm loss": 4.90488529, "grad_norm": 0.4206565, "learning_rate": 3.156e-05, "elapsed_time_per_iteration": 6.64752507, "memory(GiB)": 21.51, "elapsed_time": "7h 35m 18s", "remaining_time": "4h 1m 50s", "loss_scale": 1.0, "consumed_samples": 1063680, "global_step/max_steps": "4155/6362"} +{"lm loss": 4.90173244, "grad_norm": 0.40599036, "learning_rate": 3.154e-05, "elapsed_time_per_iteration": 6.66989517, "memory(GiB)": 21.51, "elapsed_time": "7h 35m 25s", "remaining_time": "4h 1m 44s", "loss_scale": 1.0, "consumed_samples": 1063936, "global_step/max_steps": "4156/6362"} +{"lm loss": 4.8857255, "grad_norm": 0.40755695, "learning_rate": 3.152e-05, "elapsed_time_per_iteration": 6.61265326, "memory(GiB)": 21.51, "elapsed_time": "7h 35m 31s", "remaining_time": "4h 1m 37s", "loss_scale": 1.0, "consumed_samples": 1064192, "global_step/max_steps": "4157/6362"} +{"lm loss": 4.89264536, "grad_norm": 0.41125894, "learning_rate": 3.15e-05, "elapsed_time_per_iteration": 6.62151718, "memory(GiB)": 21.51, "elapsed_time": "7h 35m 38s", "remaining_time": "4h 1m 31s", "loss_scale": 1.0, "consumed_samples": 1064448, "global_step/max_steps": "4158/6362"} +{"lm loss": 4.89456463, "grad_norm": 0.40426487, "learning_rate": 3.147e-05, "elapsed_time_per_iteration": 6.68218637, "memory(GiB)": 21.51, "elapsed_time": "7h 35m 45s", "remaining_time": "4h 1m 24s", "loss_scale": 1.0, "consumed_samples": 1064704, "global_step/max_steps": "4159/6362"} +{"lm loss": 4.90590096, "grad_norm": 0.38688353, "learning_rate": 3.145e-05, "elapsed_time_per_iteration": 6.49691939, "memory(GiB)": 21.51, "elapsed_time": "7h 35m 51s", "remaining_time": "4h 1m 17s", "loss_scale": 1.0, "consumed_samples": 1064960, "global_step/max_steps": "4160/6362"} +{"lm loss": 4.88565302, "grad_norm": 0.41221538, "learning_rate": 3.143e-05, "elapsed_time_per_iteration": 6.5891211, "memory(GiB)": 21.51, "elapsed_time": "7h 35m 58s", "remaining_time": "4h 1m 11s", "loss_scale": 1.0, "consumed_samples": 1065216, "global_step/max_steps": "4161/6362"} +{"lm loss": 4.88308191, "grad_norm": 0.38718683, "learning_rate": 3.14e-05, "elapsed_time_per_iteration": 6.66275692, "memory(GiB)": 21.51, "elapsed_time": "7h 36m 4s", "remaining_time": "4h 1m 4s", "loss_scale": 1.0, "consumed_samples": 1065472, "global_step/max_steps": "4162/6362"} +{"lm loss": 4.87905693, "grad_norm": 0.43074259, "learning_rate": 3.138e-05, "elapsed_time_per_iteration": 6.67276716, "memory(GiB)": 21.51, "elapsed_time": "7h 36m 11s", "remaining_time": "4h 0m 58s", "loss_scale": 1.0, "consumed_samples": 1065728, "global_step/max_steps": "4163/6362"} +{"lm loss": 4.88863039, "grad_norm": 0.44016105, "learning_rate": 3.136e-05, "elapsed_time_per_iteration": 6.48615837, "memory(GiB)": 21.51, "elapsed_time": "7h 36m 17s", "remaining_time": "4h 0m 51s", "loss_scale": 1.0, "consumed_samples": 1065984, "global_step/max_steps": "4164/6362"} +{"lm loss": 4.90529633, "grad_norm": 0.43861589, "learning_rate": 3.133e-05, "elapsed_time_per_iteration": 6.51718283, "memory(GiB)": 21.51, "elapsed_time": "7h 36m 24s", "remaining_time": "4h 0m 45s", "loss_scale": 1.0, "consumed_samples": 1066240, "global_step/max_steps": "4165/6362"} +{"lm loss": 4.89839458, "grad_norm": 0.3988643, "learning_rate": 3.131e-05, "elapsed_time_per_iteration": 6.5127511, "memory(GiB)": 21.51, "elapsed_time": "7h 36m 30s", "remaining_time": "4h 0m 38s", "loss_scale": 1.0, "consumed_samples": 1066496, "global_step/max_steps": "4166/6362"} +{"lm loss": 4.9007988, "grad_norm": 0.4246031, "learning_rate": 3.129e-05, "elapsed_time_per_iteration": 6.72096658, "memory(GiB)": 21.51, "elapsed_time": "7h 36m 37s", "remaining_time": "4h 0m 31s", "loss_scale": 1.0, "consumed_samples": 1066752, "global_step/max_steps": "4167/6362"} +{"lm loss": 4.91309547, "grad_norm": 0.4558771, "learning_rate": 3.127e-05, "elapsed_time_per_iteration": 6.55628753, "memory(GiB)": 21.51, "elapsed_time": "7h 36m 44s", "remaining_time": "4h 0m 25s", "loss_scale": 1.0, "consumed_samples": 1067008, "global_step/max_steps": "4168/6362"} +{"lm loss": 4.85882711, "grad_norm": 0.45193201, "learning_rate": 3.124e-05, "elapsed_time_per_iteration": 6.46483016, "memory(GiB)": 21.51, "elapsed_time": "7h 36m 50s", "remaining_time": "4h 0m 18s", "loss_scale": 1.0, "consumed_samples": 1067264, "global_step/max_steps": "4169/6362"} +{"lm loss": 4.89913082, "grad_norm": 0.3947975, "learning_rate": 3.122e-05, "elapsed_time_per_iteration": 6.52218533, "memory(GiB)": 21.51, "elapsed_time": "7h 36m 57s", "remaining_time": "4h 0m 12s", "loss_scale": 1.0, "consumed_samples": 1067520, "global_step/max_steps": "4170/6362"} +{"lm loss": 4.87798357, "grad_norm": 0.42531669, "learning_rate": 3.12e-05, "elapsed_time_per_iteration": 6.44130707, "memory(GiB)": 21.51, "elapsed_time": "7h 37m 3s", "remaining_time": "4h 0m 5s", "loss_scale": 1.0, "consumed_samples": 1067776, "global_step/max_steps": "4171/6362"} +{"lm loss": 4.89585638, "grad_norm": 0.41663569, "learning_rate": 3.117e-05, "elapsed_time_per_iteration": 6.42984986, "memory(GiB)": 21.51, "elapsed_time": "7h 37m 10s", "remaining_time": "3h 59m 58s", "loss_scale": 1.0, "consumed_samples": 1068032, "global_step/max_steps": "4172/6362"} +{"lm loss": 4.89407349, "grad_norm": 0.42513913, "learning_rate": 3.115e-05, "elapsed_time_per_iteration": 6.4616828, "memory(GiB)": 21.51, "elapsed_time": "7h 37m 16s", "remaining_time": "3h 59m 52s", "loss_scale": 1.0, "consumed_samples": 1068288, "global_step/max_steps": "4173/6362"} +{"lm loss": 4.89251232, "grad_norm": 0.37524599, "learning_rate": 3.113e-05, "elapsed_time_per_iteration": 6.43193579, "memory(GiB)": 21.51, "elapsed_time": "7h 37m 22s", "remaining_time": "3h 59m 45s", "loss_scale": 1.0, "consumed_samples": 1068544, "global_step/max_steps": "4174/6362"} +{"lm loss": 4.89108324, "grad_norm": 0.48511088, "learning_rate": 3.111e-05, "elapsed_time_per_iteration": 6.32655597, "memory(GiB)": 21.51, "elapsed_time": "7h 37m 29s", "remaining_time": "3h 59m 38s", "loss_scale": 1.0, "consumed_samples": 1068800, "global_step/max_steps": "4175/6362"} +{"lm loss": 4.89725065, "grad_norm": 0.47220609, "learning_rate": 3.108e-05, "elapsed_time_per_iteration": 6.57146144, "memory(GiB)": 21.51, "elapsed_time": "7h 37m 35s", "remaining_time": "3h 59m 32s", "loss_scale": 1.0, "consumed_samples": 1069056, "global_step/max_steps": "4176/6362"} +{"lm loss": 4.8821888, "grad_norm": 0.38816044, "learning_rate": 3.106e-05, "elapsed_time_per_iteration": 6.42414594, "memory(GiB)": 21.51, "elapsed_time": "7h 37m 42s", "remaining_time": "3h 59m 25s", "loss_scale": 1.0, "consumed_samples": 1069312, "global_step/max_steps": "4177/6362"} +{"lm loss": 4.89699411, "grad_norm": 0.41862789, "learning_rate": 3.104e-05, "elapsed_time_per_iteration": 6.46846175, "memory(GiB)": 21.51, "elapsed_time": "7h 37m 48s", "remaining_time": "3h 59m 18s", "loss_scale": 1.0, "consumed_samples": 1069568, "global_step/max_steps": "4178/6362"} +{"lm loss": 4.88468361, "grad_norm": 0.44248474, "learning_rate": 3.101e-05, "elapsed_time_per_iteration": 6.74036002, "memory(GiB)": 21.51, "elapsed_time": "7h 37m 55s", "remaining_time": "3h 59m 12s", "loss_scale": 1.0, "consumed_samples": 1069824, "global_step/max_steps": "4179/6362"} +{"lm loss": 4.89752626, "grad_norm": 0.44789606, "learning_rate": 3.099e-05, "elapsed_time_per_iteration": 6.64082694, "memory(GiB)": 21.51, "elapsed_time": "7h 38m 2s", "remaining_time": "3h 59m 5s", "loss_scale": 1.0, "consumed_samples": 1070080, "global_step/max_steps": "4180/6362"} +{"lm loss": 4.89072847, "grad_norm": 0.39556107, "learning_rate": 3.097e-05, "elapsed_time_per_iteration": 6.57333326, "memory(GiB)": 21.51, "elapsed_time": "7h 38m 8s", "remaining_time": "3h 58m 59s", "loss_scale": 1.0, "consumed_samples": 1070336, "global_step/max_steps": "4181/6362"} +{"lm loss": 4.8738904, "grad_norm": 0.49276578, "learning_rate": 3.095e-05, "elapsed_time_per_iteration": 6.91463828, "memory(GiB)": 21.51, "elapsed_time": "7h 38m 15s", "remaining_time": "3h 58m 52s", "loss_scale": 1.0, "consumed_samples": 1070592, "global_step/max_steps": "4182/6362"} +{"lm loss": 4.88956594, "grad_norm": 0.44022569, "learning_rate": 3.092e-05, "elapsed_time_per_iteration": 6.77749014, "memory(GiB)": 21.51, "elapsed_time": "7h 38m 22s", "remaining_time": "3h 58m 46s", "loss_scale": 1.0, "consumed_samples": 1070848, "global_step/max_steps": "4183/6362"} +{"lm loss": 4.88571548, "grad_norm": 0.42550427, "learning_rate": 3.09e-05, "elapsed_time_per_iteration": 6.43883562, "memory(GiB)": 21.51, "elapsed_time": "7h 38m 28s", "remaining_time": "3h 58m 39s", "loss_scale": 1.0, "consumed_samples": 1071104, "global_step/max_steps": "4184/6362"} +{"lm loss": 4.90565205, "grad_norm": 0.49915516, "learning_rate": 3.088e-05, "elapsed_time_per_iteration": 6.44456315, "memory(GiB)": 21.51, "elapsed_time": "7h 38m 35s", "remaining_time": "3h 58m 33s", "loss_scale": 1.0, "consumed_samples": 1071360, "global_step/max_steps": "4185/6362"} +{"lm loss": 4.90327835, "grad_norm": 0.46833494, "learning_rate": 3.085e-05, "elapsed_time_per_iteration": 6.55726218, "memory(GiB)": 21.51, "elapsed_time": "7h 38m 41s", "remaining_time": "3h 58m 26s", "loss_scale": 1.0, "consumed_samples": 1071616, "global_step/max_steps": "4186/6362"} +{"lm loss": 4.88728285, "grad_norm": 0.45416024, "learning_rate": 3.083e-05, "elapsed_time_per_iteration": 6.61652517, "memory(GiB)": 21.51, "elapsed_time": "7h 38m 48s", "remaining_time": "3h 58m 20s", "loss_scale": 1.0, "consumed_samples": 1071872, "global_step/max_steps": "4187/6362"} +{"lm loss": 4.90030146, "grad_norm": 0.44388327, "learning_rate": 3.081e-05, "elapsed_time_per_iteration": 6.49944162, "memory(GiB)": 21.51, "elapsed_time": "7h 38m 54s", "remaining_time": "3h 58m 13s", "loss_scale": 1.0, "consumed_samples": 1072128, "global_step/max_steps": "4188/6362"} +{"lm loss": 4.88875818, "grad_norm": 0.48692998, "learning_rate": 3.079e-05, "elapsed_time_per_iteration": 6.49618173, "memory(GiB)": 21.51, "elapsed_time": "7h 39m 1s", "remaining_time": "3h 58m 6s", "loss_scale": 1.0, "consumed_samples": 1072384, "global_step/max_steps": "4189/6362"} +{"lm loss": 4.90180016, "grad_norm": 0.43376216, "learning_rate": 3.076e-05, "elapsed_time_per_iteration": 6.63476706, "memory(GiB)": 21.51, "elapsed_time": "7h 39m 8s", "remaining_time": "3h 58m 0s", "loss_scale": 1.0, "consumed_samples": 1072640, "global_step/max_steps": "4190/6362"} +{"lm loss": 4.90896225, "grad_norm": 0.44789159, "learning_rate": 3.074e-05, "elapsed_time_per_iteration": 6.48851395, "memory(GiB)": 21.51, "elapsed_time": "7h 39m 14s", "remaining_time": "3h 57m 53s", "loss_scale": 1.0, "consumed_samples": 1072896, "global_step/max_steps": "4191/6362"} +{"lm loss": 4.90145588, "grad_norm": 0.51177251, "learning_rate": 3.072e-05, "elapsed_time_per_iteration": 6.69112659, "memory(GiB)": 21.51, "elapsed_time": "7h 39m 21s", "remaining_time": "3h 57m 47s", "loss_scale": 1.0, "consumed_samples": 1073152, "global_step/max_steps": "4192/6362"} +{"lm loss": 4.89851809, "grad_norm": 0.41277459, "learning_rate": 3.069e-05, "elapsed_time_per_iteration": 6.57838416, "memory(GiB)": 21.51, "elapsed_time": "7h 39m 27s", "remaining_time": "3h 57m 40s", "loss_scale": 1.0, "consumed_samples": 1073408, "global_step/max_steps": "4193/6362"} +{"lm loss": 4.88994074, "grad_norm": 0.47471571, "learning_rate": 3.067e-05, "elapsed_time_per_iteration": 6.62117958, "memory(GiB)": 21.51, "elapsed_time": "7h 39m 34s", "remaining_time": "3h 57m 34s", "loss_scale": 1.0, "consumed_samples": 1073664, "global_step/max_steps": "4194/6362"} +{"lm loss": 4.88579464, "grad_norm": 0.42756093, "learning_rate": 3.065e-05, "elapsed_time_per_iteration": 6.50498986, "memory(GiB)": 21.51, "elapsed_time": "7h 39m 40s", "remaining_time": "3h 57m 27s", "loss_scale": 1.0, "consumed_samples": 1073920, "global_step/max_steps": "4195/6362"} +{"lm loss": 4.91357088, "grad_norm": 0.43461093, "learning_rate": 3.063e-05, "elapsed_time_per_iteration": 6.44682837, "memory(GiB)": 21.51, "elapsed_time": "7h 39m 47s", "remaining_time": "3h 57m 20s", "loss_scale": 1.0, "consumed_samples": 1074176, "global_step/max_steps": "4196/6362"} +{"lm loss": 4.87557602, "grad_norm": 0.43510717, "learning_rate": 3.06e-05, "elapsed_time_per_iteration": 6.6665585, "memory(GiB)": 21.51, "elapsed_time": "7h 39m 54s", "remaining_time": "3h 57m 14s", "loss_scale": 1.0, "consumed_samples": 1074432, "global_step/max_steps": "4197/6362"} +{"lm loss": 4.9036026, "grad_norm": 0.42084685, "learning_rate": 3.058e-05, "elapsed_time_per_iteration": 6.3645575, "memory(GiB)": 21.51, "elapsed_time": "7h 40m 0s", "remaining_time": "3h 57m 7s", "loss_scale": 1.0, "consumed_samples": 1074688, "global_step/max_steps": "4198/6362"} +{"lm loss": 4.89486361, "grad_norm": 0.41544777, "learning_rate": 3.056e-05, "elapsed_time_per_iteration": 6.38179898, "memory(GiB)": 21.51, "elapsed_time": "7h 40m 6s", "remaining_time": "3h 57m 0s", "loss_scale": 1.0, "consumed_samples": 1074944, "global_step/max_steps": "4199/6362"} +{"lm loss": 4.89320469, "grad_norm": 0.44678032, "learning_rate": 3.054e-05, "elapsed_time_per_iteration": 6.50593853, "memory(GiB)": 21.51, "elapsed_time": "7h 40m 13s", "remaining_time": "3h 56m 54s", "loss_scale": 1.0, "consumed_samples": 1075200, "global_step/max_steps": "4200/6362"} +{"lm loss": 4.92094421, "grad_norm": 0.37057927, "learning_rate": 3.051e-05, "elapsed_time_per_iteration": 6.29675841, "memory(GiB)": 21.51, "elapsed_time": "7h 40m 19s", "remaining_time": "3h 56m 47s", "loss_scale": 1.0, "consumed_samples": 1075456, "global_step/max_steps": "4201/6362"} +{"lm loss": 4.89027071, "grad_norm": 0.4033094, "learning_rate": 3.049e-05, "elapsed_time_per_iteration": 6.37019992, "memory(GiB)": 21.51, "elapsed_time": "7h 40m 26s", "remaining_time": "3h 56m 40s", "loss_scale": 1.0, "consumed_samples": 1075712, "global_step/max_steps": "4202/6362"} +{"lm loss": 4.90098763, "grad_norm": 0.38445494, "learning_rate": 3.047e-05, "elapsed_time_per_iteration": 6.52232671, "memory(GiB)": 21.51, "elapsed_time": "7h 40m 32s", "remaining_time": "3h 56m 34s", "loss_scale": 1.0, "consumed_samples": 1075968, "global_step/max_steps": "4203/6362"} +{"lm loss": 4.87545872, "grad_norm": 0.39266005, "learning_rate": 3.044e-05, "elapsed_time_per_iteration": 6.47860193, "memory(GiB)": 21.51, "elapsed_time": "7h 40m 39s", "remaining_time": "3h 56m 27s", "loss_scale": 1.0, "consumed_samples": 1076224, "global_step/max_steps": "4204/6362"} +{"lm loss": 4.89818048, "grad_norm": 0.43177319, "learning_rate": 3.042e-05, "elapsed_time_per_iteration": 6.39681625, "memory(GiB)": 21.51, "elapsed_time": "7h 40m 45s", "remaining_time": "3h 56m 21s", "loss_scale": 1.0, "consumed_samples": 1076480, "global_step/max_steps": "4205/6362"} +{"lm loss": 4.87858915, "grad_norm": 0.37026215, "learning_rate": 3.04e-05, "elapsed_time_per_iteration": 6.48208594, "memory(GiB)": 21.51, "elapsed_time": "7h 40m 51s", "remaining_time": "3h 56m 14s", "loss_scale": 1.0, "consumed_samples": 1076736, "global_step/max_steps": "4206/6362"} +{"lm loss": 4.8933363, "grad_norm": 0.38447011, "learning_rate": 3.038e-05, "elapsed_time_per_iteration": 6.49553823, "memory(GiB)": 21.51, "elapsed_time": "7h 40m 58s", "remaining_time": "3h 56m 7s", "loss_scale": 1.0, "consumed_samples": 1076992, "global_step/max_steps": "4207/6362"} +{"lm loss": 4.87946415, "grad_norm": 0.39794692, "learning_rate": 3.035e-05, "elapsed_time_per_iteration": 6.62328458, "memory(GiB)": 21.51, "elapsed_time": "7h 41m 5s", "remaining_time": "3h 56m 1s", "loss_scale": 1.0, "consumed_samples": 1077248, "global_step/max_steps": "4208/6362"} +{"lm loss": 4.89714527, "grad_norm": 0.3992992, "learning_rate": 3.033e-05, "elapsed_time_per_iteration": 6.36788225, "memory(GiB)": 21.51, "elapsed_time": "7h 41m 11s", "remaining_time": "3h 55m 54s", "loss_scale": 1.0, "consumed_samples": 1077504, "global_step/max_steps": "4209/6362"} +{"lm loss": 4.89573956, "grad_norm": 0.41219804, "learning_rate": 3.031e-05, "elapsed_time_per_iteration": 6.88703561, "memory(GiB)": 21.51, "elapsed_time": "7h 41m 18s", "remaining_time": "3h 55m 48s", "loss_scale": 1.0, "consumed_samples": 1077760, "global_step/max_steps": "4210/6362"} +{"lm loss": 4.88406277, "grad_norm": 0.38725224, "learning_rate": 3.029e-05, "elapsed_time_per_iteration": 6.55405092, "memory(GiB)": 21.51, "elapsed_time": "7h 41m 24s", "remaining_time": "3h 55m 41s", "loss_scale": 1.0, "consumed_samples": 1078016, "global_step/max_steps": "4211/6362"} +{"lm loss": 4.89873457, "grad_norm": 0.41296443, "learning_rate": 3.026e-05, "elapsed_time_per_iteration": 6.57566667, "memory(GiB)": 21.51, "elapsed_time": "7h 41m 31s", "remaining_time": "3h 55m 34s", "loss_scale": 1.0, "consumed_samples": 1078272, "global_step/max_steps": "4212/6362"} +{"lm loss": 4.87716484, "grad_norm": 0.3935433, "learning_rate": 3.024e-05, "elapsed_time_per_iteration": 6.56824374, "memory(GiB)": 21.51, "elapsed_time": "7h 41m 37s", "remaining_time": "3h 55m 28s", "loss_scale": 1.0, "consumed_samples": 1078528, "global_step/max_steps": "4213/6362"} +{"lm loss": 4.90602636, "grad_norm": 0.41385475, "learning_rate": 3.022e-05, "elapsed_time_per_iteration": 6.47848105, "memory(GiB)": 21.51, "elapsed_time": "7h 41m 44s", "remaining_time": "3h 55m 21s", "loss_scale": 1.0, "consumed_samples": 1078784, "global_step/max_steps": "4214/6362"} +{"lm loss": 4.90689182, "grad_norm": 0.40157887, "learning_rate": 3.02e-05, "elapsed_time_per_iteration": 6.6657474, "memory(GiB)": 21.51, "elapsed_time": "7h 41m 51s", "remaining_time": "3h 55m 15s", "loss_scale": 1.0, "consumed_samples": 1079040, "global_step/max_steps": "4215/6362"} +{"lm loss": 4.8749361, "grad_norm": 0.42818004, "learning_rate": 3.017e-05, "elapsed_time_per_iteration": 6.75853729, "memory(GiB)": 21.51, "elapsed_time": "7h 41m 57s", "remaining_time": "3h 55m 8s", "loss_scale": 1.0, "consumed_samples": 1079296, "global_step/max_steps": "4216/6362"} +{"lm loss": 4.89227724, "grad_norm": 0.38404778, "learning_rate": 3.015e-05, "elapsed_time_per_iteration": 6.55152869, "memory(GiB)": 21.51, "elapsed_time": "7h 42m 4s", "remaining_time": "3h 55m 2s", "loss_scale": 1.0, "consumed_samples": 1079552, "global_step/max_steps": "4217/6362"} +{"lm loss": 4.89327717, "grad_norm": 0.3837916, "learning_rate": 3.013e-05, "elapsed_time_per_iteration": 6.8639431, "memory(GiB)": 21.51, "elapsed_time": "7h 42m 11s", "remaining_time": "3h 54m 55s", "loss_scale": 1.0, "consumed_samples": 1079808, "global_step/max_steps": "4218/6362"} +{"lm loss": 4.89439058, "grad_norm": 0.38015017, "learning_rate": 3.01e-05, "elapsed_time_per_iteration": 6.84970307, "memory(GiB)": 21.51, "elapsed_time": "7h 42m 18s", "remaining_time": "3h 54m 49s", "loss_scale": 1.0, "consumed_samples": 1080064, "global_step/max_steps": "4219/6362"} +{"lm loss": 4.90943289, "grad_norm": 0.41187671, "learning_rate": 3.008e-05, "elapsed_time_per_iteration": 6.65614724, "memory(GiB)": 21.51, "elapsed_time": "7h 42m 24s", "remaining_time": "3h 54m 42s", "loss_scale": 1.0, "consumed_samples": 1080320, "global_step/max_steps": "4220/6362"} +{"lm loss": 4.89397764, "grad_norm": 0.41926134, "learning_rate": 3.006e-05, "elapsed_time_per_iteration": 6.60935068, "memory(GiB)": 21.51, "elapsed_time": "7h 42m 31s", "remaining_time": "3h 54m 36s", "loss_scale": 1.0, "consumed_samples": 1080576, "global_step/max_steps": "4221/6362"} +{"lm loss": 4.88507271, "grad_norm": 0.3902837, "learning_rate": 3.004e-05, "elapsed_time_per_iteration": 6.54570413, "memory(GiB)": 21.51, "elapsed_time": "7h 42m 37s", "remaining_time": "3h 54m 29s", "loss_scale": 1.0, "consumed_samples": 1080832, "global_step/max_steps": "4222/6362"} +{"lm loss": 4.9043498, "grad_norm": 0.35384169, "learning_rate": 3.001e-05, "elapsed_time_per_iteration": 6.62309504, "memory(GiB)": 21.51, "elapsed_time": "7h 42m 44s", "remaining_time": "3h 54m 23s", "loss_scale": 1.0, "consumed_samples": 1081088, "global_step/max_steps": "4223/6362"} +{"lm loss": 4.87936068, "grad_norm": 0.44754773, "learning_rate": 2.999e-05, "elapsed_time_per_iteration": 6.6756084, "memory(GiB)": 21.51, "elapsed_time": "7h 42m 51s", "remaining_time": "3h 54m 16s", "loss_scale": 1.0, "consumed_samples": 1081344, "global_step/max_steps": "4224/6362"} +{"lm loss": 4.92393684, "grad_norm": 0.40055925, "learning_rate": 2.997e-05, "elapsed_time_per_iteration": 6.36806369, "memory(GiB)": 21.51, "elapsed_time": "7h 42m 57s", "remaining_time": "3h 54m 9s", "loss_scale": 1.0, "consumed_samples": 1081600, "global_step/max_steps": "4225/6362"} +{"lm loss": 4.88607407, "grad_norm": 0.40736699, "learning_rate": 2.995e-05, "elapsed_time_per_iteration": 6.7095108, "memory(GiB)": 21.51, "elapsed_time": "7h 43m 4s", "remaining_time": "3h 54m 3s", "loss_scale": 1.0, "consumed_samples": 1081856, "global_step/max_steps": "4226/6362"} +{"lm loss": 4.89596128, "grad_norm": 0.42365336, "learning_rate": 2.992e-05, "elapsed_time_per_iteration": 6.75291991, "memory(GiB)": 21.51, "elapsed_time": "7h 43m 11s", "remaining_time": "3h 53m 56s", "loss_scale": 1.0, "consumed_samples": 1082112, "global_step/max_steps": "4227/6362"} +{"lm loss": 4.91527796, "grad_norm": 0.37788945, "learning_rate": 2.99e-05, "elapsed_time_per_iteration": 6.74504232, "memory(GiB)": 21.51, "elapsed_time": "7h 43m 17s", "remaining_time": "3h 53m 50s", "loss_scale": 1.0, "consumed_samples": 1082368, "global_step/max_steps": "4228/6362"} +{"lm loss": 4.90974855, "grad_norm": 0.45939538, "learning_rate": 2.988e-05, "elapsed_time_per_iteration": 6.66709089, "memory(GiB)": 21.51, "elapsed_time": "7h 43m 24s", "remaining_time": "3h 53m 43s", "loss_scale": 1.0, "consumed_samples": 1082624, "global_step/max_steps": "4229/6362"} +{"lm loss": 4.90561056, "grad_norm": 0.44427517, "learning_rate": 2.986e-05, "elapsed_time_per_iteration": 6.32736158, "memory(GiB)": 21.51, "elapsed_time": "7h 43m 30s", "remaining_time": "3h 53m 37s", "loss_scale": 1.0, "consumed_samples": 1082880, "global_step/max_steps": "4230/6362"} +{"lm loss": 4.91247845, "grad_norm": 0.39708084, "learning_rate": 2.983e-05, "elapsed_time_per_iteration": 6.46853971, "memory(GiB)": 21.51, "elapsed_time": "7h 43m 37s", "remaining_time": "3h 53m 30s", "loss_scale": 1.0, "consumed_samples": 1083136, "global_step/max_steps": "4231/6362"} +{"lm loss": 4.9032588, "grad_norm": 0.42217079, "learning_rate": 2.981e-05, "elapsed_time_per_iteration": 6.59826851, "memory(GiB)": 21.51, "elapsed_time": "7h 43m 43s", "remaining_time": "3h 53m 23s", "loss_scale": 1.0, "consumed_samples": 1083392, "global_step/max_steps": "4232/6362"} +{"lm loss": 4.90840387, "grad_norm": 0.39610729, "learning_rate": 2.979e-05, "elapsed_time_per_iteration": 6.29815984, "memory(GiB)": 21.51, "elapsed_time": "7h 43m 50s", "remaining_time": "3h 53m 17s", "loss_scale": 1.0, "consumed_samples": 1083648, "global_step/max_steps": "4233/6362"} +{"lm loss": 4.90466166, "grad_norm": 0.45065328, "learning_rate": 2.977e-05, "elapsed_time_per_iteration": 6.52766585, "memory(GiB)": 21.51, "elapsed_time": "7h 43m 56s", "remaining_time": "3h 53m 10s", "loss_scale": 1.0, "consumed_samples": 1083904, "global_step/max_steps": "4234/6362"} +{"lm loss": 4.87960529, "grad_norm": 0.39142901, "learning_rate": 2.974e-05, "elapsed_time_per_iteration": 6.31947207, "memory(GiB)": 21.51, "elapsed_time": "7h 44m 3s", "remaining_time": "3h 53m 3s", "loss_scale": 1.0, "consumed_samples": 1084160, "global_step/max_steps": "4235/6362"} +{"lm loss": 4.89072037, "grad_norm": 0.41181245, "learning_rate": 2.972e-05, "elapsed_time_per_iteration": 6.40832949, "memory(GiB)": 21.51, "elapsed_time": "7h 44m 9s", "remaining_time": "3h 52m 57s", "loss_scale": 1.0, "consumed_samples": 1084416, "global_step/max_steps": "4236/6362"} +{"lm loss": 4.89572954, "grad_norm": 0.40901226, "learning_rate": 2.97e-05, "elapsed_time_per_iteration": 6.36106372, "memory(GiB)": 21.51, "elapsed_time": "7h 44m 15s", "remaining_time": "3h 52m 50s", "loss_scale": 1.0, "consumed_samples": 1084672, "global_step/max_steps": "4237/6362"} +{"lm loss": 4.87893629, "grad_norm": 0.42684644, "learning_rate": 2.968e-05, "elapsed_time_per_iteration": 6.50052691, "memory(GiB)": 21.51, "elapsed_time": "7h 44m 22s", "remaining_time": "3h 52m 44s", "loss_scale": 1.0, "consumed_samples": 1084928, "global_step/max_steps": "4238/6362"} +{"lm loss": 4.90894222, "grad_norm": 0.44266519, "learning_rate": 2.965e-05, "elapsed_time_per_iteration": 6.51130676, "memory(GiB)": 21.51, "elapsed_time": "7h 44m 28s", "remaining_time": "3h 52m 37s", "loss_scale": 1.0, "consumed_samples": 1085184, "global_step/max_steps": "4239/6362"} +{"lm loss": 4.87485504, "grad_norm": 0.46356168, "learning_rate": 2.963e-05, "elapsed_time_per_iteration": 6.48974061, "memory(GiB)": 21.51, "elapsed_time": "7h 44m 35s", "remaining_time": "3h 52m 30s", "loss_scale": 1.0, "consumed_samples": 1085440, "global_step/max_steps": "4240/6362"} +{"lm loss": 4.8881793, "grad_norm": 0.39726734, "learning_rate": 2.961e-05, "elapsed_time_per_iteration": 6.32612276, "memory(GiB)": 21.51, "elapsed_time": "7h 44m 41s", "remaining_time": "3h 52m 24s", "loss_scale": 1.0, "consumed_samples": 1085696, "global_step/max_steps": "4241/6362"} +{"lm loss": 4.90111923, "grad_norm": 0.43747976, "learning_rate": 2.959e-05, "elapsed_time_per_iteration": 6.34017229, "memory(GiB)": 21.51, "elapsed_time": "7h 44m 47s", "remaining_time": "3h 52m 17s", "loss_scale": 1.0, "consumed_samples": 1085952, "global_step/max_steps": "4242/6362"} +{"lm loss": 4.89820576, "grad_norm": 0.42437261, "learning_rate": 2.956e-05, "elapsed_time_per_iteration": 6.60409904, "memory(GiB)": 21.51, "elapsed_time": "7h 44m 54s", "remaining_time": "3h 52m 10s", "loss_scale": 1.0, "consumed_samples": 1086208, "global_step/max_steps": "4243/6362"} +{"lm loss": 4.90240049, "grad_norm": 0.45606551, "learning_rate": 2.954e-05, "elapsed_time_per_iteration": 6.51738238, "memory(GiB)": 21.51, "elapsed_time": "7h 45m 1s", "remaining_time": "3h 52m 4s", "loss_scale": 1.0, "consumed_samples": 1086464, "global_step/max_steps": "4244/6362"} +{"lm loss": 4.8862977, "grad_norm": 0.43242022, "learning_rate": 2.952e-05, "elapsed_time_per_iteration": 6.53972745, "memory(GiB)": 21.51, "elapsed_time": "7h 45m 7s", "remaining_time": "3h 51m 57s", "loss_scale": 1.0, "consumed_samples": 1086720, "global_step/max_steps": "4245/6362"} +{"lm loss": 4.8938942, "grad_norm": 0.45530683, "learning_rate": 2.95e-05, "elapsed_time_per_iteration": 6.48869824, "memory(GiB)": 21.51, "elapsed_time": "7h 45m 14s", "remaining_time": "3h 51m 51s", "loss_scale": 1.0, "consumed_samples": 1086976, "global_step/max_steps": "4246/6362"} +{"lm loss": 4.87315083, "grad_norm": 0.449862, "learning_rate": 2.947e-05, "elapsed_time_per_iteration": 6.75081515, "memory(GiB)": 21.51, "elapsed_time": "7h 45m 20s", "remaining_time": "3h 51m 44s", "loss_scale": 1.0, "consumed_samples": 1087232, "global_step/max_steps": "4247/6362"} +{"lm loss": 4.8924017, "grad_norm": 0.37647268, "learning_rate": 2.945e-05, "elapsed_time_per_iteration": 6.48526812, "memory(GiB)": 21.51, "elapsed_time": "7h 45m 27s", "remaining_time": "3h 51m 37s", "loss_scale": 1.0, "consumed_samples": 1087488, "global_step/max_steps": "4248/6362"} +{"lm loss": 4.86628675, "grad_norm": 0.43391919, "learning_rate": 2.943e-05, "elapsed_time_per_iteration": 6.81149435, "memory(GiB)": 21.51, "elapsed_time": "7h 45m 34s", "remaining_time": "3h 51m 31s", "loss_scale": 1.0, "consumed_samples": 1087744, "global_step/max_steps": "4249/6362"} +{"lm loss": 4.89664221, "grad_norm": 0.41676727, "learning_rate": 2.941e-05, "elapsed_time_per_iteration": 6.55931115, "memory(GiB)": 21.51, "elapsed_time": "7h 45m 40s", "remaining_time": "3h 51m 24s", "loss_scale": 1.0, "consumed_samples": 1088000, "global_step/max_steps": "4250/6362"} +{"lm loss": 4.8741169, "grad_norm": 0.40680602, "learning_rate": 2.938e-05, "elapsed_time_per_iteration": 6.79312062, "memory(GiB)": 21.51, "elapsed_time": "7h 45m 47s", "remaining_time": "3h 51m 18s", "loss_scale": 1.0, "consumed_samples": 1088256, "global_step/max_steps": "4251/6362"} +{"lm loss": 4.89335537, "grad_norm": 0.47620472, "learning_rate": 2.936e-05, "elapsed_time_per_iteration": 6.55315614, "memory(GiB)": 21.51, "elapsed_time": "7h 45m 54s", "remaining_time": "3h 51m 11s", "loss_scale": 1.0, "consumed_samples": 1088512, "global_step/max_steps": "4252/6362"} +{"lm loss": 4.89707661, "grad_norm": 0.46527913, "learning_rate": 2.934e-05, "elapsed_time_per_iteration": 6.36008763, "memory(GiB)": 21.51, "elapsed_time": "7h 46m 0s", "remaining_time": "3h 51m 5s", "loss_scale": 1.0, "consumed_samples": 1088768, "global_step/max_steps": "4253/6362"} +{"lm loss": 4.85461092, "grad_norm": 0.38872507, "learning_rate": 2.932e-05, "elapsed_time_per_iteration": 6.63366961, "memory(GiB)": 21.51, "elapsed_time": "7h 46m 7s", "remaining_time": "3h 50m 58s", "loss_scale": 1.0, "consumed_samples": 1089024, "global_step/max_steps": "4254/6362"} +{"lm loss": 4.88402081, "grad_norm": 0.42805099, "learning_rate": 2.929e-05, "elapsed_time_per_iteration": 6.48728561, "memory(GiB)": 21.51, "elapsed_time": "7h 46m 13s", "remaining_time": "3h 50m 52s", "loss_scale": 1.0, "consumed_samples": 1089280, "global_step/max_steps": "4255/6362"} +{"lm loss": 4.89186811, "grad_norm": 0.39335626, "learning_rate": 2.927e-05, "elapsed_time_per_iteration": 6.36312079, "memory(GiB)": 21.51, "elapsed_time": "7h 46m 19s", "remaining_time": "3h 50m 45s", "loss_scale": 1.0, "consumed_samples": 1089536, "global_step/max_steps": "4256/6362"} +{"lm loss": 4.90739679, "grad_norm": 0.40902305, "learning_rate": 2.925e-05, "elapsed_time_per_iteration": 6.59585452, "memory(GiB)": 21.51, "elapsed_time": "7h 46m 26s", "remaining_time": "3h 50m 38s", "loss_scale": 1.0, "consumed_samples": 1089792, "global_step/max_steps": "4257/6362"} +{"lm loss": 4.90009689, "grad_norm": 0.42579484, "learning_rate": 2.923e-05, "elapsed_time_per_iteration": 6.47646046, "memory(GiB)": 21.51, "elapsed_time": "7h 46m 32s", "remaining_time": "3h 50m 32s", "loss_scale": 1.0, "consumed_samples": 1090048, "global_step/max_steps": "4258/6362"} +{"lm loss": 4.90505552, "grad_norm": 0.38421547, "learning_rate": 2.92e-05, "elapsed_time_per_iteration": 6.61667085, "memory(GiB)": 21.51, "elapsed_time": "7h 46m 39s", "remaining_time": "3h 50m 25s", "loss_scale": 1.0, "consumed_samples": 1090304, "global_step/max_steps": "4259/6362"} +{"lm loss": 4.88989019, "grad_norm": 0.39865533, "learning_rate": 2.918e-05, "elapsed_time_per_iteration": 6.62951636, "memory(GiB)": 21.51, "elapsed_time": "7h 46m 46s", "remaining_time": "3h 50m 19s", "loss_scale": 1.0, "consumed_samples": 1090560, "global_step/max_steps": "4260/6362"} +{"lm loss": 4.88310337, "grad_norm": 0.44297388, "learning_rate": 2.916e-05, "elapsed_time_per_iteration": 6.37366629, "memory(GiB)": 21.51, "elapsed_time": "7h 46m 52s", "remaining_time": "3h 50m 12s", "loss_scale": 1.0, "consumed_samples": 1090816, "global_step/max_steps": "4261/6362"} +{"lm loss": 4.90375423, "grad_norm": 0.35745436, "learning_rate": 2.914e-05, "elapsed_time_per_iteration": 6.54841185, "memory(GiB)": 21.51, "elapsed_time": "7h 46m 59s", "remaining_time": "3h 50m 5s", "loss_scale": 1.0, "consumed_samples": 1091072, "global_step/max_steps": "4262/6362"} +{"lm loss": 4.92182255, "grad_norm": 0.42146602, "learning_rate": 2.911e-05, "elapsed_time_per_iteration": 6.39330721, "memory(GiB)": 21.51, "elapsed_time": "7h 47m 5s", "remaining_time": "3h 49m 59s", "loss_scale": 1.0, "consumed_samples": 1091328, "global_step/max_steps": "4263/6362"} +{"lm loss": 4.89639091, "grad_norm": 0.39148083, "learning_rate": 2.909e-05, "elapsed_time_per_iteration": 6.44701171, "memory(GiB)": 21.51, "elapsed_time": "7h 47m 12s", "remaining_time": "3h 49m 52s", "loss_scale": 1.0, "consumed_samples": 1091584, "global_step/max_steps": "4264/6362"} +{"lm loss": 4.90614367, "grad_norm": 0.42390546, "learning_rate": 2.907e-05, "elapsed_time_per_iteration": 6.53078008, "memory(GiB)": 21.51, "elapsed_time": "7h 47m 18s", "remaining_time": "3h 49m 45s", "loss_scale": 1.0, "consumed_samples": 1091840, "global_step/max_steps": "4265/6362"} +{"lm loss": 4.8905139, "grad_norm": 0.37553638, "learning_rate": 2.905e-05, "elapsed_time_per_iteration": 6.40708232, "memory(GiB)": 21.51, "elapsed_time": "7h 47m 24s", "remaining_time": "3h 49m 39s", "loss_scale": 1.0, "consumed_samples": 1092096, "global_step/max_steps": "4266/6362"} +{"lm loss": 4.89497042, "grad_norm": 0.41164878, "learning_rate": 2.903e-05, "elapsed_time_per_iteration": 6.71808577, "memory(GiB)": 21.51, "elapsed_time": "7h 47m 31s", "remaining_time": "3h 49m 32s", "loss_scale": 1.0, "consumed_samples": 1092352, "global_step/max_steps": "4267/6362"} +{"lm loss": 4.88778257, "grad_norm": 0.3898136, "learning_rate": 2.9e-05, "elapsed_time_per_iteration": 6.46603274, "memory(GiB)": 21.51, "elapsed_time": "7h 47m 38s", "remaining_time": "3h 49m 26s", "loss_scale": 1.0, "consumed_samples": 1092608, "global_step/max_steps": "4268/6362"} +{"lm loss": 4.89280224, "grad_norm": 0.39007255, "learning_rate": 2.898e-05, "elapsed_time_per_iteration": 6.55009246, "memory(GiB)": 21.51, "elapsed_time": "7h 47m 44s", "remaining_time": "3h 49m 19s", "loss_scale": 1.0, "consumed_samples": 1092864, "global_step/max_steps": "4269/6362"} +{"lm loss": 4.88592386, "grad_norm": 0.40286186, "learning_rate": 2.896e-05, "elapsed_time_per_iteration": 6.45431042, "memory(GiB)": 21.51, "elapsed_time": "7h 47m 51s", "remaining_time": "3h 49m 12s", "loss_scale": 1.0, "consumed_samples": 1093120, "global_step/max_steps": "4270/6362"} +{"lm loss": 4.89432049, "grad_norm": 0.40122744, "learning_rate": 2.894e-05, "elapsed_time_per_iteration": 6.72164321, "memory(GiB)": 21.51, "elapsed_time": "7h 47m 57s", "remaining_time": "3h 49m 6s", "loss_scale": 1.0, "consumed_samples": 1093376, "global_step/max_steps": "4271/6362"} +{"lm loss": 4.90209723, "grad_norm": 0.40724933, "learning_rate": 2.891e-05, "elapsed_time_per_iteration": 6.55233836, "memory(GiB)": 21.51, "elapsed_time": "7h 48m 4s", "remaining_time": "3h 48m 59s", "loss_scale": 1.0, "consumed_samples": 1093632, "global_step/max_steps": "4272/6362"} +{"lm loss": 4.88924265, "grad_norm": 0.38758594, "learning_rate": 2.889e-05, "elapsed_time_per_iteration": 6.7898612, "memory(GiB)": 21.51, "elapsed_time": "7h 48m 11s", "remaining_time": "3h 48m 53s", "loss_scale": 1.0, "consumed_samples": 1093888, "global_step/max_steps": "4273/6362"} +{"lm loss": 4.89429808, "grad_norm": 0.37458718, "learning_rate": 2.887e-05, "elapsed_time_per_iteration": 6.54511023, "memory(GiB)": 21.51, "elapsed_time": "7h 48m 17s", "remaining_time": "3h 48m 46s", "loss_scale": 1.0, "consumed_samples": 1094144, "global_step/max_steps": "4274/6362"} +{"lm loss": 4.8786993, "grad_norm": 0.38642123, "learning_rate": 2.885e-05, "elapsed_time_per_iteration": 6.45749569, "memory(GiB)": 21.51, "elapsed_time": "7h 48m 24s", "remaining_time": "3h 48m 40s", "loss_scale": 1.0, "consumed_samples": 1094400, "global_step/max_steps": "4275/6362"} +{"lm loss": 4.89578485, "grad_norm": 0.37587994, "learning_rate": 2.882e-05, "elapsed_time_per_iteration": 6.63466287, "memory(GiB)": 21.51, "elapsed_time": "7h 48m 30s", "remaining_time": "3h 48m 33s", "loss_scale": 1.0, "consumed_samples": 1094656, "global_step/max_steps": "4276/6362"} +{"lm loss": 4.88323069, "grad_norm": 0.3925496, "learning_rate": 2.88e-05, "elapsed_time_per_iteration": 6.56435394, "memory(GiB)": 21.51, "elapsed_time": "7h 48m 37s", "remaining_time": "3h 48m 26s", "loss_scale": 1.0, "consumed_samples": 1094912, "global_step/max_steps": "4277/6362"} +{"lm loss": 4.8878355, "grad_norm": 0.37376794, "learning_rate": 2.878e-05, "elapsed_time_per_iteration": 6.38896894, "memory(GiB)": 21.51, "elapsed_time": "7h 48m 43s", "remaining_time": "3h 48m 20s", "loss_scale": 1.0, "consumed_samples": 1095168, "global_step/max_steps": "4278/6362"} +{"lm loss": 4.87966108, "grad_norm": 0.40313631, "learning_rate": 2.876e-05, "elapsed_time_per_iteration": 6.49745393, "memory(GiB)": 21.51, "elapsed_time": "7h 48m 50s", "remaining_time": "3h 48m 13s", "loss_scale": 1.0, "consumed_samples": 1095424, "global_step/max_steps": "4279/6362"} +{"lm loss": 4.90146494, "grad_norm": 0.3547326, "learning_rate": 2.874e-05, "elapsed_time_per_iteration": 6.41430306, "memory(GiB)": 21.51, "elapsed_time": "7h 48m 56s", "remaining_time": "3h 48m 7s", "loss_scale": 1.0, "consumed_samples": 1095680, "global_step/max_steps": "4280/6362"} +{"lm loss": 4.89602709, "grad_norm": 0.43889126, "learning_rate": 2.871e-05, "elapsed_time_per_iteration": 6.85280418, "memory(GiB)": 21.51, "elapsed_time": "7h 49m 3s", "remaining_time": "3h 48m 0s", "loss_scale": 1.0, "consumed_samples": 1095936, "global_step/max_steps": "4281/6362"} +{"lm loss": 4.92250729, "grad_norm": 0.38959068, "learning_rate": 2.869e-05, "elapsed_time_per_iteration": 6.51401734, "memory(GiB)": 21.51, "elapsed_time": "7h 49m 10s", "remaining_time": "3h 47m 54s", "loss_scale": 1.0, "consumed_samples": 1096192, "global_step/max_steps": "4282/6362"} +{"lm loss": 4.92993927, "grad_norm": 0.39461318, "learning_rate": 2.867e-05, "elapsed_time_per_iteration": 6.59114671, "memory(GiB)": 21.51, "elapsed_time": "7h 49m 16s", "remaining_time": "3h 47m 47s", "loss_scale": 1.0, "consumed_samples": 1096448, "global_step/max_steps": "4283/6362"} +{"lm loss": 4.87812662, "grad_norm": 0.38285792, "learning_rate": 2.865e-05, "elapsed_time_per_iteration": 6.56652379, "memory(GiB)": 21.51, "elapsed_time": "7h 49m 23s", "remaining_time": "3h 47m 40s", "loss_scale": 1.0, "consumed_samples": 1096704, "global_step/max_steps": "4284/6362"} +{"lm loss": 4.89040709, "grad_norm": 0.43699747, "learning_rate": 2.862e-05, "elapsed_time_per_iteration": 6.60133958, "memory(GiB)": 21.51, "elapsed_time": "7h 49m 29s", "remaining_time": "3h 47m 34s", "loss_scale": 1.0, "consumed_samples": 1096960, "global_step/max_steps": "4285/6362"} +{"lm loss": 4.8777051, "grad_norm": 0.41242722, "learning_rate": 2.86e-05, "elapsed_time_per_iteration": 6.37202358, "memory(GiB)": 21.51, "elapsed_time": "7h 49m 36s", "remaining_time": "3h 47m 27s", "loss_scale": 1.0, "consumed_samples": 1097216, "global_step/max_steps": "4286/6362"} +{"lm loss": 4.89024544, "grad_norm": 0.39909002, "learning_rate": 2.858e-05, "elapsed_time_per_iteration": 6.65090513, "memory(GiB)": 21.51, "elapsed_time": "7h 49m 42s", "remaining_time": "3h 47m 21s", "loss_scale": 1.0, "consumed_samples": 1097472, "global_step/max_steps": "4287/6362"} +{"lm loss": 4.880198, "grad_norm": 0.51132441, "learning_rate": 2.856e-05, "elapsed_time_per_iteration": 6.43132973, "memory(GiB)": 21.51, "elapsed_time": "7h 49m 49s", "remaining_time": "3h 47m 14s", "loss_scale": 1.0, "consumed_samples": 1097728, "global_step/max_steps": "4288/6362"} +{"lm loss": 4.88689184, "grad_norm": 0.39139983, "learning_rate": 2.854e-05, "elapsed_time_per_iteration": 6.49029708, "memory(GiB)": 21.51, "elapsed_time": "7h 49m 55s", "remaining_time": "3h 47m 7s", "loss_scale": 1.0, "consumed_samples": 1097984, "global_step/max_steps": "4289/6362"} +{"lm loss": 4.90912485, "grad_norm": 0.4166187, "learning_rate": 2.851e-05, "elapsed_time_per_iteration": 6.61365771, "memory(GiB)": 21.51, "elapsed_time": "7h 50m 2s", "remaining_time": "3h 47m 1s", "loss_scale": 1.0, "consumed_samples": 1098240, "global_step/max_steps": "4290/6362"} +{"lm loss": 4.89479733, "grad_norm": 0.44530049, "learning_rate": 2.849e-05, "elapsed_time_per_iteration": 6.83491731, "memory(GiB)": 21.51, "elapsed_time": "7h 50m 9s", "remaining_time": "3h 46m 54s", "loss_scale": 1.0, "consumed_samples": 1098496, "global_step/max_steps": "4291/6362"} +{"lm loss": 4.89328194, "grad_norm": 0.41842261, "learning_rate": 2.847e-05, "elapsed_time_per_iteration": 6.54974222, "memory(GiB)": 21.51, "elapsed_time": "7h 50m 15s", "remaining_time": "3h 46m 48s", "loss_scale": 1.0, "consumed_samples": 1098752, "global_step/max_steps": "4292/6362"} +{"lm loss": 4.8895402, "grad_norm": 0.3749063, "learning_rate": 2.845e-05, "elapsed_time_per_iteration": 6.44854617, "memory(GiB)": 21.51, "elapsed_time": "7h 50m 22s", "remaining_time": "3h 46m 41s", "loss_scale": 1.0, "consumed_samples": 1099008, "global_step/max_steps": "4293/6362"} +{"lm loss": 4.88919306, "grad_norm": 0.424447, "learning_rate": 2.842e-05, "elapsed_time_per_iteration": 6.38014507, "memory(GiB)": 21.51, "elapsed_time": "7h 50m 28s", "remaining_time": "3h 46m 34s", "loss_scale": 1.0, "consumed_samples": 1099264, "global_step/max_steps": "4294/6362"} +{"lm loss": 4.90527058, "grad_norm": 0.43611121, "learning_rate": 2.84e-05, "elapsed_time_per_iteration": 6.50218797, "memory(GiB)": 21.51, "elapsed_time": "7h 50m 35s", "remaining_time": "3h 46m 28s", "loss_scale": 1.0, "consumed_samples": 1099520, "global_step/max_steps": "4295/6362"} +{"lm loss": 4.90095139, "grad_norm": 0.44621876, "learning_rate": 2.838e-05, "elapsed_time_per_iteration": 6.67656589, "memory(GiB)": 21.51, "elapsed_time": "7h 50m 41s", "remaining_time": "3h 46m 21s", "loss_scale": 1.0, "consumed_samples": 1099776, "global_step/max_steps": "4296/6362"} +{"lm loss": 4.90346336, "grad_norm": 0.39831054, "learning_rate": 2.836e-05, "elapsed_time_per_iteration": 6.52419782, "memory(GiB)": 21.51, "elapsed_time": "7h 50m 48s", "remaining_time": "3h 46m 15s", "loss_scale": 1.0, "consumed_samples": 1100032, "global_step/max_steps": "4297/6362"} +{"lm loss": 4.8684864, "grad_norm": 0.40411037, "learning_rate": 2.834e-05, "elapsed_time_per_iteration": 6.38600731, "memory(GiB)": 21.51, "elapsed_time": "7h 50m 54s", "remaining_time": "3h 46m 8s", "loss_scale": 1.0, "consumed_samples": 1100288, "global_step/max_steps": "4298/6362"} +{"lm loss": 4.86917973, "grad_norm": 0.48488119, "learning_rate": 2.831e-05, "elapsed_time_per_iteration": 6.38340712, "memory(GiB)": 21.51, "elapsed_time": "7h 51m 1s", "remaining_time": "3h 46m 1s", "loss_scale": 1.0, "consumed_samples": 1100544, "global_step/max_steps": "4299/6362"} +{"lm loss": 4.89134026, "grad_norm": 0.39673793, "learning_rate": 2.829e-05, "elapsed_time_per_iteration": 6.36225748, "memory(GiB)": 21.51, "elapsed_time": "7h 51m 7s", "remaining_time": "3h 45m 55s", "loss_scale": 1.0, "consumed_samples": 1100800, "global_step/max_steps": "4300/6362"} +{"lm loss": 4.90103865, "grad_norm": 0.42883497, "learning_rate": 2.827e-05, "elapsed_time_per_iteration": 6.53225446, "memory(GiB)": 21.51, "elapsed_time": "7h 51m 13s", "remaining_time": "3h 45m 48s", "loss_scale": 1.0, "consumed_samples": 1101056, "global_step/max_steps": "4301/6362"} +{"lm loss": 4.90285778, "grad_norm": 0.41700095, "learning_rate": 2.825e-05, "elapsed_time_per_iteration": 6.33127046, "memory(GiB)": 21.51, "elapsed_time": "7h 51m 20s", "remaining_time": "3h 45m 41s", "loss_scale": 1.0, "consumed_samples": 1101312, "global_step/max_steps": "4302/6362"} +{"lm loss": 4.89560747, "grad_norm": 0.39435741, "learning_rate": 2.823e-05, "elapsed_time_per_iteration": 6.53820705, "memory(GiB)": 21.51, "elapsed_time": "7h 51m 26s", "remaining_time": "3h 45m 35s", "loss_scale": 1.0, "consumed_samples": 1101568, "global_step/max_steps": "4303/6362"} +{"lm loss": 4.88238859, "grad_norm": 0.43042326, "learning_rate": 2.82e-05, "elapsed_time_per_iteration": 6.33335638, "memory(GiB)": 21.51, "elapsed_time": "7h 51m 33s", "remaining_time": "3h 45m 28s", "loss_scale": 1.0, "consumed_samples": 1101824, "global_step/max_steps": "4304/6362"} +{"lm loss": 4.88034439, "grad_norm": 0.43160787, "learning_rate": 2.818e-05, "elapsed_time_per_iteration": 6.37012029, "memory(GiB)": 21.51, "elapsed_time": "7h 51m 39s", "remaining_time": "3h 45m 21s", "loss_scale": 1.0, "consumed_samples": 1102080, "global_step/max_steps": "4305/6362"} +{"lm loss": 4.89545441, "grad_norm": 0.38265204, "learning_rate": 2.816e-05, "elapsed_time_per_iteration": 6.48319674, "memory(GiB)": 21.51, "elapsed_time": "7h 51m 46s", "remaining_time": "3h 45m 15s", "loss_scale": 1.0, "consumed_samples": 1102336, "global_step/max_steps": "4306/6362"} +{"lm loss": 4.89347839, "grad_norm": 0.4361763, "learning_rate": 2.814e-05, "elapsed_time_per_iteration": 6.45824647, "memory(GiB)": 21.51, "elapsed_time": "7h 51m 52s", "remaining_time": "3h 45m 8s", "loss_scale": 1.0, "consumed_samples": 1102592, "global_step/max_steps": "4307/6362"} +{"lm loss": 4.88594246, "grad_norm": 0.38593534, "learning_rate": 2.811e-05, "elapsed_time_per_iteration": 6.46037674, "memory(GiB)": 21.51, "elapsed_time": "7h 51m 58s", "remaining_time": "3h 45m 2s", "loss_scale": 1.0, "consumed_samples": 1102848, "global_step/max_steps": "4308/6362"} +{"lm loss": 4.88140106, "grad_norm": 0.41650262, "learning_rate": 2.809e-05, "elapsed_time_per_iteration": 6.44546652, "memory(GiB)": 21.51, "elapsed_time": "7h 52m 5s", "remaining_time": "3h 44m 55s", "loss_scale": 1.0, "consumed_samples": 1103104, "global_step/max_steps": "4309/6362"} +{"lm loss": 4.88196087, "grad_norm": 0.42118639, "learning_rate": 2.807e-05, "elapsed_time_per_iteration": 6.6420064, "memory(GiB)": 21.51, "elapsed_time": "7h 52m 12s", "remaining_time": "3h 44m 48s", "loss_scale": 1.0, "consumed_samples": 1103360, "global_step/max_steps": "4310/6362"} +{"lm loss": 4.8905158, "grad_norm": 0.39856476, "learning_rate": 2.805e-05, "elapsed_time_per_iteration": 6.65781283, "memory(GiB)": 21.51, "elapsed_time": "7h 52m 18s", "remaining_time": "3h 44m 42s", "loss_scale": 1.0, "consumed_samples": 1103616, "global_step/max_steps": "4311/6362"} +{"lm loss": 4.87739038, "grad_norm": 0.38050994, "learning_rate": 2.803e-05, "elapsed_time_per_iteration": 6.56321287, "memory(GiB)": 21.51, "elapsed_time": "7h 52m 25s", "remaining_time": "3h 44m 35s", "loss_scale": 1.0, "consumed_samples": 1103872, "global_step/max_steps": "4312/6362"} +{"lm loss": 4.91946507, "grad_norm": 0.401784, "learning_rate": 2.8e-05, "elapsed_time_per_iteration": 6.4543891, "memory(GiB)": 21.51, "elapsed_time": "7h 52m 31s", "remaining_time": "3h 44m 29s", "loss_scale": 1.0, "consumed_samples": 1104128, "global_step/max_steps": "4313/6362"} +{"lm loss": 4.86904001, "grad_norm": 0.39247468, "learning_rate": 2.798e-05, "elapsed_time_per_iteration": 6.50091767, "memory(GiB)": 21.51, "elapsed_time": "7h 52m 38s", "remaining_time": "3h 44m 22s", "loss_scale": 1.0, "consumed_samples": 1104384, "global_step/max_steps": "4314/6362"} +{"lm loss": 4.89747, "grad_norm": 0.39871031, "learning_rate": 2.796e-05, "elapsed_time_per_iteration": 6.56521893, "memory(GiB)": 21.51, "elapsed_time": "7h 52m 44s", "remaining_time": "3h 44m 16s", "loss_scale": 1.0, "consumed_samples": 1104640, "global_step/max_steps": "4315/6362"} +{"lm loss": 4.88735247, "grad_norm": 0.41864461, "learning_rate": 2.794e-05, "elapsed_time_per_iteration": 6.67278218, "memory(GiB)": 21.51, "elapsed_time": "7h 52m 51s", "remaining_time": "3h 44m 9s", "loss_scale": 1.0, "consumed_samples": 1104896, "global_step/max_steps": "4316/6362"} +{"lm loss": 4.86165905, "grad_norm": 0.40974486, "learning_rate": 2.792e-05, "elapsed_time_per_iteration": 6.51896214, "memory(GiB)": 21.51, "elapsed_time": "7h 52m 57s", "remaining_time": "3h 44m 2s", "loss_scale": 1.0, "consumed_samples": 1105152, "global_step/max_steps": "4317/6362"} +{"lm loss": 4.87314606, "grad_norm": 0.41077593, "learning_rate": 2.789e-05, "elapsed_time_per_iteration": 6.94946527, "memory(GiB)": 21.51, "elapsed_time": "7h 53m 4s", "remaining_time": "3h 43m 56s", "loss_scale": 1.0, "consumed_samples": 1105408, "global_step/max_steps": "4318/6362"} +{"lm loss": 4.86842299, "grad_norm": 0.43479291, "learning_rate": 2.787e-05, "elapsed_time_per_iteration": 6.92301798, "memory(GiB)": 21.51, "elapsed_time": "7h 53m 11s", "remaining_time": "3h 43m 50s", "loss_scale": 1.0, "consumed_samples": 1105664, "global_step/max_steps": "4319/6362"} +{"lm loss": 4.88446522, "grad_norm": 0.40047273, "learning_rate": 2.785e-05, "elapsed_time_per_iteration": 6.51115131, "memory(GiB)": 21.51, "elapsed_time": "7h 53m 18s", "remaining_time": "3h 43m 43s", "loss_scale": 1.0, "consumed_samples": 1105920, "global_step/max_steps": "4320/6362"} +{"lm loss": 4.89418936, "grad_norm": 0.39456481, "learning_rate": 2.783e-05, "elapsed_time_per_iteration": 6.46522665, "memory(GiB)": 21.51, "elapsed_time": "7h 53m 24s", "remaining_time": "3h 43m 36s", "loss_scale": 1.0, "consumed_samples": 1106176, "global_step/max_steps": "4321/6362"} +{"lm loss": 4.87464809, "grad_norm": 0.39485824, "learning_rate": 2.781e-05, "elapsed_time_per_iteration": 6.50598001, "memory(GiB)": 21.51, "elapsed_time": "7h 53m 31s", "remaining_time": "3h 43m 30s", "loss_scale": 1.0, "consumed_samples": 1106432, "global_step/max_steps": "4322/6362"} +{"lm loss": 4.8869133, "grad_norm": 0.39205548, "learning_rate": 2.778e-05, "elapsed_time_per_iteration": 6.49305844, "memory(GiB)": 21.51, "elapsed_time": "7h 53m 37s", "remaining_time": "3h 43m 23s", "loss_scale": 1.0, "consumed_samples": 1106688, "global_step/max_steps": "4323/6362"} +{"lm loss": 4.89734554, "grad_norm": 0.36439347, "learning_rate": 2.776e-05, "elapsed_time_per_iteration": 6.56113219, "memory(GiB)": 21.51, "elapsed_time": "7h 53m 44s", "remaining_time": "3h 43m 17s", "loss_scale": 1.0, "consumed_samples": 1106944, "global_step/max_steps": "4324/6362"} +{"lm loss": 4.9198947, "grad_norm": 0.38956395, "learning_rate": 2.774e-05, "elapsed_time_per_iteration": 6.62798691, "memory(GiB)": 21.51, "elapsed_time": "7h 53m 50s", "remaining_time": "3h 43m 10s", "loss_scale": 1.0, "consumed_samples": 1107200, "global_step/max_steps": "4325/6362"} +{"lm loss": 4.89109564, "grad_norm": 0.37747753, "learning_rate": 2.772e-05, "elapsed_time_per_iteration": 6.75139499, "memory(GiB)": 21.51, "elapsed_time": "7h 53m 57s", "remaining_time": "3h 43m 4s", "loss_scale": 1.0, "consumed_samples": 1107456, "global_step/max_steps": "4326/6362"} +{"lm loss": 4.86092281, "grad_norm": 0.38939896, "learning_rate": 2.77e-05, "elapsed_time_per_iteration": 6.71582437, "memory(GiB)": 21.51, "elapsed_time": "7h 54m 4s", "remaining_time": "3h 42m 57s", "loss_scale": 1.0, "consumed_samples": 1107712, "global_step/max_steps": "4327/6362"} +{"lm loss": 4.89362192, "grad_norm": 0.34696007, "learning_rate": 2.767e-05, "elapsed_time_per_iteration": 6.64505816, "memory(GiB)": 21.51, "elapsed_time": "7h 54m 11s", "remaining_time": "3h 42m 50s", "loss_scale": 1.0, "consumed_samples": 1107968, "global_step/max_steps": "4328/6362"} +{"lm loss": 4.89465523, "grad_norm": 0.41047525, "learning_rate": 2.765e-05, "elapsed_time_per_iteration": 6.62084794, "memory(GiB)": 21.51, "elapsed_time": "7h 54m 17s", "remaining_time": "3h 42m 44s", "loss_scale": 1.0, "consumed_samples": 1108224, "global_step/max_steps": "4329/6362"} +{"lm loss": 4.88913727, "grad_norm": 0.40352932, "learning_rate": 2.763e-05, "elapsed_time_per_iteration": 6.6718595, "memory(GiB)": 21.51, "elapsed_time": "7h 54m 24s", "remaining_time": "3h 42m 37s", "loss_scale": 1.0, "consumed_samples": 1108480, "global_step/max_steps": "4330/6362"} +{"lm loss": 4.88503885, "grad_norm": 0.39702278, "learning_rate": 2.761e-05, "elapsed_time_per_iteration": 6.53872824, "memory(GiB)": 21.51, "elapsed_time": "7h 54m 30s", "remaining_time": "3h 42m 31s", "loss_scale": 1.0, "consumed_samples": 1108736, "global_step/max_steps": "4331/6362"} +{"lm loss": 4.90645075, "grad_norm": 0.40269494, "learning_rate": 2.759e-05, "elapsed_time_per_iteration": 6.713974, "memory(GiB)": 21.51, "elapsed_time": "7h 54m 37s", "remaining_time": "3h 42m 24s", "loss_scale": 1.0, "consumed_samples": 1108992, "global_step/max_steps": "4332/6362"} +{"lm loss": 4.8865509, "grad_norm": 0.45880434, "learning_rate": 2.756e-05, "elapsed_time_per_iteration": 6.50587416, "memory(GiB)": 21.51, "elapsed_time": "7h 54m 44s", "remaining_time": "3h 42m 18s", "loss_scale": 1.0, "consumed_samples": 1109248, "global_step/max_steps": "4333/6362"} +{"lm loss": 4.89714384, "grad_norm": 0.47362378, "learning_rate": 2.754e-05, "elapsed_time_per_iteration": 6.60385489, "memory(GiB)": 21.51, "elapsed_time": "7h 54m 50s", "remaining_time": "3h 42m 11s", "loss_scale": 1.0, "consumed_samples": 1109504, "global_step/max_steps": "4334/6362"} +{"lm loss": 4.87450838, "grad_norm": 0.43053687, "learning_rate": 2.752e-05, "elapsed_time_per_iteration": 6.6103549, "memory(GiB)": 21.51, "elapsed_time": "7h 54m 57s", "remaining_time": "3h 42m 5s", "loss_scale": 1.0, "consumed_samples": 1109760, "global_step/max_steps": "4335/6362"} +{"lm loss": 4.89623594, "grad_norm": 0.4547841, "learning_rate": 2.75e-05, "elapsed_time_per_iteration": 6.76608968, "memory(GiB)": 21.51, "elapsed_time": "7h 55m 4s", "remaining_time": "3h 41m 58s", "loss_scale": 1.0, "consumed_samples": 1110016, "global_step/max_steps": "4336/6362"} +{"lm loss": 4.88836384, "grad_norm": 0.42527935, "learning_rate": 2.748e-05, "elapsed_time_per_iteration": 6.67900634, "memory(GiB)": 21.51, "elapsed_time": "7h 55m 10s", "remaining_time": "3h 41m 52s", "loss_scale": 1.0, "consumed_samples": 1110272, "global_step/max_steps": "4337/6362"} +{"lm loss": 4.89629745, "grad_norm": 0.46789727, "learning_rate": 2.745e-05, "elapsed_time_per_iteration": 6.65030956, "memory(GiB)": 21.51, "elapsed_time": "7h 55m 17s", "remaining_time": "3h 41m 45s", "loss_scale": 1.0, "consumed_samples": 1110528, "global_step/max_steps": "4338/6362"} +{"lm loss": 4.9186635, "grad_norm": 0.47823763, "learning_rate": 2.743e-05, "elapsed_time_per_iteration": 6.64083648, "memory(GiB)": 21.51, "elapsed_time": "7h 55m 24s", "remaining_time": "3h 41m 38s", "loss_scale": 1.0, "consumed_samples": 1110784, "global_step/max_steps": "4339/6362"} +{"lm loss": 4.86568499, "grad_norm": 0.40271482, "learning_rate": 2.741e-05, "elapsed_time_per_iteration": 6.70158887, "memory(GiB)": 21.51, "elapsed_time": "7h 55m 30s", "remaining_time": "3h 41m 32s", "loss_scale": 1.0, "consumed_samples": 1111040, "global_step/max_steps": "4340/6362"} +{"lm loss": 4.878438, "grad_norm": 0.41456789, "learning_rate": 2.739e-05, "elapsed_time_per_iteration": 6.69511819, "memory(GiB)": 21.51, "elapsed_time": "7h 55m 37s", "remaining_time": "3h 41m 25s", "loss_scale": 1.0, "consumed_samples": 1111296, "global_step/max_steps": "4341/6362"} +{"lm loss": 4.89671659, "grad_norm": 0.40866849, "learning_rate": 2.737e-05, "elapsed_time_per_iteration": 6.6067462, "memory(GiB)": 21.51, "elapsed_time": "7h 55m 44s", "remaining_time": "3h 41m 19s", "loss_scale": 1.0, "consumed_samples": 1111552, "global_step/max_steps": "4342/6362"} +{"lm loss": 4.87789726, "grad_norm": 0.39799228, "learning_rate": 2.735e-05, "elapsed_time_per_iteration": 6.5439384, "memory(GiB)": 21.51, "elapsed_time": "7h 55m 50s", "remaining_time": "3h 41m 12s", "loss_scale": 1.0, "consumed_samples": 1111808, "global_step/max_steps": "4343/6362"} +{"lm loss": 4.90185165, "grad_norm": 0.37665644, "learning_rate": 2.732e-05, "elapsed_time_per_iteration": 6.53897929, "memory(GiB)": 21.51, "elapsed_time": "7h 55m 57s", "remaining_time": "3h 41m 6s", "loss_scale": 1.0, "consumed_samples": 1112064, "global_step/max_steps": "4344/6362"} +{"lm loss": 4.87580442, "grad_norm": 0.41557065, "learning_rate": 2.73e-05, "elapsed_time_per_iteration": 6.52404332, "memory(GiB)": 21.51, "elapsed_time": "7h 56m 3s", "remaining_time": "3h 40m 59s", "loss_scale": 1.0, "consumed_samples": 1112320, "global_step/max_steps": "4345/6362"} +{"lm loss": 4.90891647, "grad_norm": 0.36187905, "learning_rate": 2.728e-05, "elapsed_time_per_iteration": 6.62152696, "memory(GiB)": 21.51, "elapsed_time": "7h 56m 10s", "remaining_time": "3h 40m 53s", "loss_scale": 1.0, "consumed_samples": 1112576, "global_step/max_steps": "4346/6362"} +{"lm loss": 4.89335823, "grad_norm": 0.38164186, "learning_rate": 2.726e-05, "elapsed_time_per_iteration": 6.46182656, "memory(GiB)": 21.51, "elapsed_time": "7h 56m 16s", "remaining_time": "3h 40m 46s", "loss_scale": 1.0, "consumed_samples": 1112832, "global_step/max_steps": "4347/6362"} +{"lm loss": 4.87120295, "grad_norm": 0.36388421, "learning_rate": 2.724e-05, "elapsed_time_per_iteration": 6.64114976, "memory(GiB)": 21.51, "elapsed_time": "7h 56m 23s", "remaining_time": "3h 40m 39s", "loss_scale": 1.0, "consumed_samples": 1113088, "global_step/max_steps": "4348/6362"} +{"lm loss": 4.86940193, "grad_norm": 0.36735654, "learning_rate": 2.721e-05, "elapsed_time_per_iteration": 6.3358016, "memory(GiB)": 21.51, "elapsed_time": "7h 56m 29s", "remaining_time": "3h 40m 33s", "loss_scale": 1.0, "consumed_samples": 1113344, "global_step/max_steps": "4349/6362"} +{"lm loss": 4.88273525, "grad_norm": 0.3820228, "learning_rate": 2.719e-05, "elapsed_time_per_iteration": 6.37322688, "memory(GiB)": 21.51, "elapsed_time": "7h 56m 36s", "remaining_time": "3h 40m 26s", "loss_scale": 1.0, "consumed_samples": 1113600, "global_step/max_steps": "4350/6362"} +{"lm loss": 4.9104991, "grad_norm": 0.38553983, "learning_rate": 2.717e-05, "elapsed_time_per_iteration": 6.54150128, "memory(GiB)": 21.51, "elapsed_time": "7h 56m 42s", "remaining_time": "3h 40m 19s", "loss_scale": 1.0, "consumed_samples": 1113856, "global_step/max_steps": "4351/6362"} +{"lm loss": 4.860991, "grad_norm": 0.39246395, "learning_rate": 2.715e-05, "elapsed_time_per_iteration": 6.44847083, "memory(GiB)": 21.51, "elapsed_time": "7h 56m 49s", "remaining_time": "3h 40m 13s", "loss_scale": 1.0, "consumed_samples": 1114112, "global_step/max_steps": "4352/6362"} +{"lm loss": 4.87955618, "grad_norm": 0.36474231, "learning_rate": 2.713e-05, "elapsed_time_per_iteration": 6.53092957, "memory(GiB)": 21.51, "elapsed_time": "7h 56m 55s", "remaining_time": "3h 40m 6s", "loss_scale": 1.0, "consumed_samples": 1114368, "global_step/max_steps": "4353/6362"} +{"lm loss": 4.88419962, "grad_norm": 0.36699873, "learning_rate": 2.711e-05, "elapsed_time_per_iteration": 6.47754145, "memory(GiB)": 21.51, "elapsed_time": "7h 57m 2s", "remaining_time": "3h 40m 0s", "loss_scale": 1.0, "consumed_samples": 1114624, "global_step/max_steps": "4354/6362"} +{"lm loss": 4.86206961, "grad_norm": 0.37527463, "learning_rate": 2.708e-05, "elapsed_time_per_iteration": 6.29359102, "memory(GiB)": 21.51, "elapsed_time": "7h 57m 8s", "remaining_time": "3h 39m 53s", "loss_scale": 1.0, "consumed_samples": 1114880, "global_step/max_steps": "4355/6362"} +{"lm loss": 4.88391638, "grad_norm": 0.39860117, "learning_rate": 2.706e-05, "elapsed_time_per_iteration": 6.46397758, "memory(GiB)": 21.51, "elapsed_time": "7h 57m 14s", "remaining_time": "3h 39m 46s", "loss_scale": 1.0, "consumed_samples": 1115136, "global_step/max_steps": "4356/6362"} +{"lm loss": 4.89116287, "grad_norm": 0.35177475, "learning_rate": 2.704e-05, "elapsed_time_per_iteration": 6.7063396, "memory(GiB)": 21.51, "elapsed_time": "7h 57m 21s", "remaining_time": "3h 39m 40s", "loss_scale": 1.0, "consumed_samples": 1115392, "global_step/max_steps": "4357/6362"} +{"lm loss": 4.89581919, "grad_norm": 0.39619765, "learning_rate": 2.702e-05, "elapsed_time_per_iteration": 6.40909243, "memory(GiB)": 21.51, "elapsed_time": "7h 57m 28s", "remaining_time": "3h 39m 33s", "loss_scale": 1.0, "consumed_samples": 1115648, "global_step/max_steps": "4358/6362"} +{"lm loss": 4.87318468, "grad_norm": 0.35694966, "learning_rate": 2.7e-05, "elapsed_time_per_iteration": 6.67711496, "memory(GiB)": 21.51, "elapsed_time": "7h 57m 34s", "remaining_time": "3h 39m 27s", "loss_scale": 1.0, "consumed_samples": 1115904, "global_step/max_steps": "4359/6362"} +{"lm loss": 4.87981987, "grad_norm": 0.3955107, "learning_rate": 2.697e-05, "elapsed_time_per_iteration": 6.37321901, "memory(GiB)": 21.51, "elapsed_time": "7h 57m 41s", "remaining_time": "3h 39m 20s", "loss_scale": 1.0, "consumed_samples": 1116160, "global_step/max_steps": "4360/6362"} +{"lm loss": 4.90762901, "grad_norm": 0.42433617, "learning_rate": 2.695e-05, "elapsed_time_per_iteration": 6.43423486, "memory(GiB)": 21.51, "elapsed_time": "7h 57m 47s", "remaining_time": "3h 39m 13s", "loss_scale": 1.0, "consumed_samples": 1116416, "global_step/max_steps": "4361/6362"} +{"lm loss": 4.86943388, "grad_norm": 0.45331311, "learning_rate": 2.693e-05, "elapsed_time_per_iteration": 6.8083818, "memory(GiB)": 21.51, "elapsed_time": "7h 57m 54s", "remaining_time": "3h 39m 7s", "loss_scale": 1.0, "consumed_samples": 1116672, "global_step/max_steps": "4362/6362"} +{"lm loss": 4.90455437, "grad_norm": 0.4104369, "learning_rate": 2.691e-05, "elapsed_time_per_iteration": 6.40953445, "memory(GiB)": 21.51, "elapsed_time": "7h 58m 0s", "remaining_time": "3h 39m 0s", "loss_scale": 1.0, "consumed_samples": 1116928, "global_step/max_steps": "4363/6362"} +{"lm loss": 4.8905592, "grad_norm": 0.42664695, "learning_rate": 2.689e-05, "elapsed_time_per_iteration": 6.38039303, "memory(GiB)": 21.51, "elapsed_time": "7h 58m 7s", "remaining_time": "3h 38m 54s", "loss_scale": 1.0, "consumed_samples": 1117184, "global_step/max_steps": "4364/6362"} +{"lm loss": 4.89151573, "grad_norm": 0.4128623, "learning_rate": 2.687e-05, "elapsed_time_per_iteration": 6.49349594, "memory(GiB)": 21.51, "elapsed_time": "7h 58m 13s", "remaining_time": "3h 38m 47s", "loss_scale": 1.0, "consumed_samples": 1117440, "global_step/max_steps": "4365/6362"} +{"lm loss": 4.90859509, "grad_norm": 0.39695093, "learning_rate": 2.684e-05, "elapsed_time_per_iteration": 6.37739587, "memory(GiB)": 21.51, "elapsed_time": "7h 58m 19s", "remaining_time": "3h 38m 40s", "loss_scale": 1.0, "consumed_samples": 1117696, "global_step/max_steps": "4366/6362"} +{"lm loss": 4.90897799, "grad_norm": 0.39508477, "learning_rate": 2.682e-05, "elapsed_time_per_iteration": 6.39745069, "memory(GiB)": 21.51, "elapsed_time": "7h 58m 26s", "remaining_time": "3h 38m 34s", "loss_scale": 1.0, "consumed_samples": 1117952, "global_step/max_steps": "4367/6362"} +{"lm loss": 4.88508654, "grad_norm": 0.39744624, "learning_rate": 2.68e-05, "elapsed_time_per_iteration": 6.40342331, "memory(GiB)": 21.51, "elapsed_time": "7h 58m 32s", "remaining_time": "3h 38m 27s", "loss_scale": 1.0, "consumed_samples": 1118208, "global_step/max_steps": "4368/6362"} +{"lm loss": 4.90649366, "grad_norm": 0.39387977, "learning_rate": 2.678e-05, "elapsed_time_per_iteration": 6.55478001, "memory(GiB)": 21.51, "elapsed_time": "7h 58m 39s", "remaining_time": "3h 38m 20s", "loss_scale": 1.0, "consumed_samples": 1118464, "global_step/max_steps": "4369/6362"} +{"lm loss": 4.88783741, "grad_norm": 0.37917736, "learning_rate": 2.676e-05, "elapsed_time_per_iteration": 6.47203445, "memory(GiB)": 21.51, "elapsed_time": "7h 58m 45s", "remaining_time": "3h 38m 14s", "loss_scale": 1.0, "consumed_samples": 1118720, "global_step/max_steps": "4370/6362"} +{"lm loss": 4.91193151, "grad_norm": 0.43738618, "learning_rate": 2.674e-05, "elapsed_time_per_iteration": 6.75085855, "memory(GiB)": 21.51, "elapsed_time": "7h 58m 52s", "remaining_time": "3h 38m 7s", "loss_scale": 1.0, "consumed_samples": 1118976, "global_step/max_steps": "4371/6362"} +{"lm loss": 4.8919611, "grad_norm": 0.40973032, "learning_rate": 2.671e-05, "elapsed_time_per_iteration": 6.5194087, "memory(GiB)": 21.51, "elapsed_time": "7h 58m 59s", "remaining_time": "3h 38m 1s", "loss_scale": 1.0, "consumed_samples": 1119232, "global_step/max_steps": "4372/6362"} +{"lm loss": 4.86876011, "grad_norm": 0.40231165, "learning_rate": 2.669e-05, "elapsed_time_per_iteration": 6.37726068, "memory(GiB)": 21.51, "elapsed_time": "7h 59m 5s", "remaining_time": "3h 37m 54s", "loss_scale": 1.0, "consumed_samples": 1119488, "global_step/max_steps": "4373/6362"} +{"lm loss": 4.87537766, "grad_norm": 0.41950843, "learning_rate": 2.667e-05, "elapsed_time_per_iteration": 6.61503768, "memory(GiB)": 21.51, "elapsed_time": "7h 59m 12s", "remaining_time": "3h 37m 47s", "loss_scale": 1.0, "consumed_samples": 1119744, "global_step/max_steps": "4374/6362"} +{"lm loss": 4.89472151, "grad_norm": 0.40787089, "learning_rate": 2.665e-05, "elapsed_time_per_iteration": 6.39804697, "memory(GiB)": 21.51, "elapsed_time": "7h 59m 18s", "remaining_time": "3h 37m 41s", "loss_scale": 1.0, "consumed_samples": 1120000, "global_step/max_steps": "4375/6362"} +{"lm loss": 4.90121126, "grad_norm": 0.39731309, "learning_rate": 2.663e-05, "elapsed_time_per_iteration": 6.47389007, "memory(GiB)": 21.51, "elapsed_time": "7h 59m 24s", "remaining_time": "3h 37m 34s", "loss_scale": 1.0, "consumed_samples": 1120256, "global_step/max_steps": "4376/6362"} +{"lm loss": 4.87623692, "grad_norm": 0.3775548, "learning_rate": 2.661e-05, "elapsed_time_per_iteration": 6.53156352, "memory(GiB)": 21.51, "elapsed_time": "7h 59m 31s", "remaining_time": "3h 37m 28s", "loss_scale": 1.0, "consumed_samples": 1120512, "global_step/max_steps": "4377/6362"} +{"lm loss": 4.88184929, "grad_norm": 0.42330208, "learning_rate": 2.658e-05, "elapsed_time_per_iteration": 6.64604616, "memory(GiB)": 21.51, "elapsed_time": "7h 59m 38s", "remaining_time": "3h 37m 21s", "loss_scale": 1.0, "consumed_samples": 1120768, "global_step/max_steps": "4378/6362"} +{"lm loss": 4.91357756, "grad_norm": 0.41261134, "learning_rate": 2.656e-05, "elapsed_time_per_iteration": 6.50814772, "memory(GiB)": 21.51, "elapsed_time": "7h 59m 44s", "remaining_time": "3h 37m 14s", "loss_scale": 1.0, "consumed_samples": 1121024, "global_step/max_steps": "4379/6362"} +{"lm loss": 4.8888154, "grad_norm": 0.40855455, "learning_rate": 2.654e-05, "elapsed_time_per_iteration": 6.27190566, "memory(GiB)": 21.51, "elapsed_time": "7h 59m 50s", "remaining_time": "3h 37m 8s", "loss_scale": 1.0, "consumed_samples": 1121280, "global_step/max_steps": "4380/6362"} +{"lm loss": 4.88993692, "grad_norm": 0.40940574, "learning_rate": 2.652e-05, "elapsed_time_per_iteration": 6.6264205, "memory(GiB)": 21.51, "elapsed_time": "7h 59m 57s", "remaining_time": "3h 37m 1s", "loss_scale": 1.0, "consumed_samples": 1121536, "global_step/max_steps": "4381/6362"} +{"lm loss": 4.91422653, "grad_norm": 0.39798316, "learning_rate": 2.65e-05, "elapsed_time_per_iteration": 6.40037823, "memory(GiB)": 21.51, "elapsed_time": "8h 0m 3s", "remaining_time": "3h 36m 55s", "loss_scale": 1.0, "consumed_samples": 1121792, "global_step/max_steps": "4382/6362"} +{"lm loss": 4.90240955, "grad_norm": 0.42673793, "learning_rate": 2.648e-05, "elapsed_time_per_iteration": 6.65167594, "memory(GiB)": 21.51, "elapsed_time": "8h 0m 10s", "remaining_time": "3h 36m 48s", "loss_scale": 1.0, "consumed_samples": 1122048, "global_step/max_steps": "4383/6362"} +{"lm loss": 4.87644005, "grad_norm": 0.37552395, "learning_rate": 2.645e-05, "elapsed_time_per_iteration": 6.67903018, "memory(GiB)": 21.51, "elapsed_time": "8h 0m 17s", "remaining_time": "3h 36m 41s", "loss_scale": 1.0, "consumed_samples": 1122304, "global_step/max_steps": "4384/6362"} +{"lm loss": 4.89660311, "grad_norm": 0.46324384, "learning_rate": 2.643e-05, "elapsed_time_per_iteration": 6.42949724, "memory(GiB)": 21.51, "elapsed_time": "8h 0m 23s", "remaining_time": "3h 36m 35s", "loss_scale": 1.0, "consumed_samples": 1122560, "global_step/max_steps": "4385/6362"} +{"lm loss": 4.9068222, "grad_norm": 0.35598233, "learning_rate": 2.641e-05, "elapsed_time_per_iteration": 6.72098351, "memory(GiB)": 21.51, "elapsed_time": "8h 0m 30s", "remaining_time": "3h 36m 28s", "loss_scale": 1.0, "consumed_samples": 1122816, "global_step/max_steps": "4386/6362"} +{"lm loss": 4.89007711, "grad_norm": 0.45106676, "learning_rate": 2.639e-05, "elapsed_time_per_iteration": 6.5764792, "memory(GiB)": 21.51, "elapsed_time": "8h 0m 36s", "remaining_time": "3h 36m 22s", "loss_scale": 1.0, "consumed_samples": 1123072, "global_step/max_steps": "4387/6362"} +{"lm loss": 4.89673328, "grad_norm": 0.39245746, "learning_rate": 2.637e-05, "elapsed_time_per_iteration": 6.49725199, "memory(GiB)": 21.51, "elapsed_time": "8h 0m 43s", "remaining_time": "3h 36m 15s", "loss_scale": 1.0, "consumed_samples": 1123328, "global_step/max_steps": "4388/6362"} +{"lm loss": 4.89092541, "grad_norm": 0.39087334, "learning_rate": 2.635e-05, "elapsed_time_per_iteration": 6.76699758, "memory(GiB)": 21.51, "elapsed_time": "8h 0m 50s", "remaining_time": "3h 36m 9s", "loss_scale": 1.0, "consumed_samples": 1123584, "global_step/max_steps": "4389/6362"} +{"lm loss": 4.8706398, "grad_norm": 0.34303635, "learning_rate": 2.633e-05, "elapsed_time_per_iteration": 6.48497152, "memory(GiB)": 21.51, "elapsed_time": "8h 0m 56s", "remaining_time": "3h 36m 2s", "loss_scale": 1.0, "consumed_samples": 1123840, "global_step/max_steps": "4390/6362"} +{"lm loss": 4.90220022, "grad_norm": 0.40986702, "learning_rate": 2.63e-05, "elapsed_time_per_iteration": 6.54174733, "memory(GiB)": 21.51, "elapsed_time": "8h 1m 3s", "remaining_time": "3h 35m 55s", "loss_scale": 1.0, "consumed_samples": 1124096, "global_step/max_steps": "4391/6362"} +{"lm loss": 4.88253498, "grad_norm": 0.37461159, "learning_rate": 2.628e-05, "elapsed_time_per_iteration": 6.637151, "memory(GiB)": 21.51, "elapsed_time": "8h 1m 9s", "remaining_time": "3h 35m 49s", "loss_scale": 1.0, "consumed_samples": 1124352, "global_step/max_steps": "4392/6362"} +{"lm loss": 4.87910366, "grad_norm": 0.38698336, "learning_rate": 2.626e-05, "elapsed_time_per_iteration": 6.50958562, "memory(GiB)": 21.51, "elapsed_time": "8h 1m 16s", "remaining_time": "3h 35m 42s", "loss_scale": 1.0, "consumed_samples": 1124608, "global_step/max_steps": "4393/6362"} +{"lm loss": 4.89380217, "grad_norm": 0.36216903, "learning_rate": 2.624e-05, "elapsed_time_per_iteration": 6.47078609, "memory(GiB)": 21.51, "elapsed_time": "8h 1m 22s", "remaining_time": "3h 35m 36s", "loss_scale": 1.0, "consumed_samples": 1124864, "global_step/max_steps": "4394/6362"} +{"lm loss": 4.89933252, "grad_norm": 0.36928323, "learning_rate": 2.622e-05, "elapsed_time_per_iteration": 6.80255246, "memory(GiB)": 21.51, "elapsed_time": "8h 1m 29s", "remaining_time": "3h 35m 29s", "loss_scale": 1.0, "consumed_samples": 1125120, "global_step/max_steps": "4395/6362"} +{"lm loss": 4.88337898, "grad_norm": 0.399584, "learning_rate": 2.62e-05, "elapsed_time_per_iteration": 6.45197892, "memory(GiB)": 21.51, "elapsed_time": "8h 1m 36s", "remaining_time": "3h 35m 23s", "loss_scale": 1.0, "consumed_samples": 1125376, "global_step/max_steps": "4396/6362"} +{"lm loss": 4.90188265, "grad_norm": 0.4126415, "learning_rate": 2.617e-05, "elapsed_time_per_iteration": 6.26224136, "memory(GiB)": 21.51, "elapsed_time": "8h 1m 42s", "remaining_time": "3h 35m 16s", "loss_scale": 1.0, "consumed_samples": 1125632, "global_step/max_steps": "4397/6362"} +{"lm loss": 4.89885807, "grad_norm": 0.37069967, "learning_rate": 2.615e-05, "elapsed_time_per_iteration": 6.3120091, "memory(GiB)": 21.51, "elapsed_time": "8h 1m 48s", "remaining_time": "3h 35m 9s", "loss_scale": 1.0, "consumed_samples": 1125888, "global_step/max_steps": "4398/6362"} +{"lm loss": 4.90181255, "grad_norm": 0.38961247, "learning_rate": 2.613e-05, "elapsed_time_per_iteration": 6.42454767, "memory(GiB)": 21.51, "elapsed_time": "8h 1m 55s", "remaining_time": "3h 35m 3s", "loss_scale": 1.0, "consumed_samples": 1126144, "global_step/max_steps": "4399/6362"} +{"lm loss": 4.90117264, "grad_norm": 0.38063636, "learning_rate": 2.611e-05, "elapsed_time_per_iteration": 6.48531651, "memory(GiB)": 21.51, "elapsed_time": "8h 2m 1s", "remaining_time": "3h 34m 56s", "loss_scale": 1.0, "consumed_samples": 1126400, "global_step/max_steps": "4400/6362"} +{"lm loss": 4.89445639, "grad_norm": 0.38636845, "learning_rate": 2.609e-05, "elapsed_time_per_iteration": 6.31602407, "memory(GiB)": 21.51, "elapsed_time": "8h 2m 7s", "remaining_time": "3h 34m 49s", "loss_scale": 1.0, "consumed_samples": 1126656, "global_step/max_steps": "4401/6362"} +{"lm loss": 4.88985109, "grad_norm": 0.33797655, "learning_rate": 2.607e-05, "elapsed_time_per_iteration": 6.52740574, "memory(GiB)": 21.51, "elapsed_time": "8h 2m 14s", "remaining_time": "3h 34m 43s", "loss_scale": 1.0, "consumed_samples": 1126912, "global_step/max_steps": "4402/6362"} +{"lm loss": 4.92460108, "grad_norm": 0.40723136, "learning_rate": 2.605e-05, "elapsed_time_per_iteration": 6.50909376, "memory(GiB)": 21.51, "elapsed_time": "8h 2m 20s", "remaining_time": "3h 34m 36s", "loss_scale": 1.0, "consumed_samples": 1127168, "global_step/max_steps": "4403/6362"} +{"lm loss": 4.8873806, "grad_norm": 0.36370769, "learning_rate": 2.602e-05, "elapsed_time_per_iteration": 6.5455308, "memory(GiB)": 21.51, "elapsed_time": "8h 2m 27s", "remaining_time": "3h 34m 29s", "loss_scale": 1.0, "consumed_samples": 1127424, "global_step/max_steps": "4404/6362"} +{"lm loss": 4.89782906, "grad_norm": 0.3708171, "learning_rate": 2.6e-05, "elapsed_time_per_iteration": 6.47951984, "memory(GiB)": 21.51, "elapsed_time": "8h 2m 34s", "remaining_time": "3h 34m 23s", "loss_scale": 1.0, "consumed_samples": 1127680, "global_step/max_steps": "4405/6362"} +{"lm loss": 4.89757156, "grad_norm": 0.42639661, "learning_rate": 2.598e-05, "elapsed_time_per_iteration": 6.49245858, "memory(GiB)": 21.51, "elapsed_time": "8h 2m 40s", "remaining_time": "3h 34m 16s", "loss_scale": 1.0, "consumed_samples": 1127936, "global_step/max_steps": "4406/6362"} +{"lm loss": 4.88728523, "grad_norm": 0.41331446, "learning_rate": 2.596e-05, "elapsed_time_per_iteration": 6.57551527, "memory(GiB)": 21.51, "elapsed_time": "8h 2m 47s", "remaining_time": "3h 34m 10s", "loss_scale": 1.0, "consumed_samples": 1128192, "global_step/max_steps": "4407/6362"} +{"lm loss": 4.90985012, "grad_norm": 0.36924928, "learning_rate": 2.594e-05, "elapsed_time_per_iteration": 6.6318922, "memory(GiB)": 21.51, "elapsed_time": "8h 2m 53s", "remaining_time": "3h 34m 3s", "loss_scale": 1.0, "consumed_samples": 1128448, "global_step/max_steps": "4408/6362"} +{"lm loss": 4.88947725, "grad_norm": 0.41597113, "learning_rate": 2.592e-05, "elapsed_time_per_iteration": 6.50086498, "memory(GiB)": 21.51, "elapsed_time": "8h 3m 0s", "remaining_time": "3h 33m 57s", "loss_scale": 1.0, "consumed_samples": 1128704, "global_step/max_steps": "4409/6362"} +{"lm loss": 4.88696098, "grad_norm": 0.37802997, "learning_rate": 2.59e-05, "elapsed_time_per_iteration": 6.53781962, "memory(GiB)": 21.51, "elapsed_time": "8h 3m 6s", "remaining_time": "3h 33m 50s", "loss_scale": 1.0, "consumed_samples": 1128960, "global_step/max_steps": "4410/6362"} +{"lm loss": 4.87165928, "grad_norm": 0.40728235, "learning_rate": 2.587e-05, "elapsed_time_per_iteration": 6.49358296, "memory(GiB)": 21.51, "elapsed_time": "8h 3m 13s", "remaining_time": "3h 33m 43s", "loss_scale": 1.0, "consumed_samples": 1129216, "global_step/max_steps": "4411/6362"} +{"lm loss": 4.87297773, "grad_norm": 0.40558845, "learning_rate": 2.585e-05, "elapsed_time_per_iteration": 6.88270807, "memory(GiB)": 21.51, "elapsed_time": "8h 3m 20s", "remaining_time": "3h 33m 37s", "loss_scale": 1.0, "consumed_samples": 1129472, "global_step/max_steps": "4412/6362"} +{"lm loss": 4.88028479, "grad_norm": 0.35226235, "learning_rate": 2.583e-05, "elapsed_time_per_iteration": 6.71575737, "memory(GiB)": 21.51, "elapsed_time": "8h 3m 26s", "remaining_time": "3h 33m 30s", "loss_scale": 1.0, "consumed_samples": 1129728, "global_step/max_steps": "4413/6362"} +{"lm loss": 4.8701973, "grad_norm": 0.43063956, "learning_rate": 2.581e-05, "elapsed_time_per_iteration": 6.62777686, "memory(GiB)": 21.51, "elapsed_time": "8h 3m 33s", "remaining_time": "3h 33m 24s", "loss_scale": 1.0, "consumed_samples": 1129984, "global_step/max_steps": "4414/6362"} +{"lm loss": 4.89830971, "grad_norm": 0.38657632, "learning_rate": 2.579e-05, "elapsed_time_per_iteration": 6.57406378, "memory(GiB)": 21.51, "elapsed_time": "8h 3m 40s", "remaining_time": "3h 33m 17s", "loss_scale": 1.0, "consumed_samples": 1130240, "global_step/max_steps": "4415/6362"} +{"lm loss": 4.90104055, "grad_norm": 0.41422924, "learning_rate": 2.577e-05, "elapsed_time_per_iteration": 6.60292053, "memory(GiB)": 21.51, "elapsed_time": "8h 3m 46s", "remaining_time": "3h 33m 11s", "loss_scale": 1.0, "consumed_samples": 1130496, "global_step/max_steps": "4416/6362"} +{"lm loss": 4.90061951, "grad_norm": 0.38676164, "learning_rate": 2.575e-05, "elapsed_time_per_iteration": 6.56397986, "memory(GiB)": 21.51, "elapsed_time": "8h 3m 53s", "remaining_time": "3h 33m 4s", "loss_scale": 1.0, "consumed_samples": 1130752, "global_step/max_steps": "4417/6362"} +{"lm loss": 4.86700106, "grad_norm": 0.4230938, "learning_rate": 2.572e-05, "elapsed_time_per_iteration": 6.55524898, "memory(GiB)": 21.51, "elapsed_time": "8h 3m 59s", "remaining_time": "3h 32m 58s", "loss_scale": 1.0, "consumed_samples": 1131008, "global_step/max_steps": "4418/6362"} +{"lm loss": 4.86229324, "grad_norm": 0.44232452, "learning_rate": 2.57e-05, "elapsed_time_per_iteration": 6.70634961, "memory(GiB)": 21.51, "elapsed_time": "8h 4m 6s", "remaining_time": "3h 32m 51s", "loss_scale": 1.0, "consumed_samples": 1131264, "global_step/max_steps": "4419/6362"} +{"lm loss": 4.86850882, "grad_norm": 0.38472256, "learning_rate": 2.568e-05, "elapsed_time_per_iteration": 6.65841627, "memory(GiB)": 21.51, "elapsed_time": "8h 4m 13s", "remaining_time": "3h 32m 44s", "loss_scale": 1.0, "consumed_samples": 1131520, "global_step/max_steps": "4420/6362"} +{"lm loss": 4.85966778, "grad_norm": 0.41757664, "learning_rate": 2.566e-05, "elapsed_time_per_iteration": 6.68926573, "memory(GiB)": 21.51, "elapsed_time": "8h 4m 19s", "remaining_time": "3h 32m 38s", "loss_scale": 1.0, "consumed_samples": 1131776, "global_step/max_steps": "4421/6362"} +{"lm loss": 4.88874006, "grad_norm": 0.41422492, "learning_rate": 2.564e-05, "elapsed_time_per_iteration": 6.55111241, "memory(GiB)": 21.51, "elapsed_time": "8h 4m 26s", "remaining_time": "3h 32m 31s", "loss_scale": 1.0, "consumed_samples": 1132032, "global_step/max_steps": "4422/6362"} +{"lm loss": 4.8837266, "grad_norm": 0.38798666, "learning_rate": 2.562e-05, "elapsed_time_per_iteration": 6.69090867, "memory(GiB)": 21.51, "elapsed_time": "8h 4m 33s", "remaining_time": "3h 32m 25s", "loss_scale": 1.0, "consumed_samples": 1132288, "global_step/max_steps": "4423/6362"} +{"lm loss": 4.88312769, "grad_norm": 0.39112267, "learning_rate": 2.56e-05, "elapsed_time_per_iteration": 6.81821179, "memory(GiB)": 21.51, "elapsed_time": "8h 4m 39s", "remaining_time": "3h 32m 18s", "loss_scale": 1.0, "consumed_samples": 1132544, "global_step/max_steps": "4424/6362"} +{"lm loss": 4.89535141, "grad_norm": 0.39458358, "learning_rate": 2.558e-05, "elapsed_time_per_iteration": 6.78237414, "memory(GiB)": 21.51, "elapsed_time": "8h 4m 46s", "remaining_time": "3h 32m 12s", "loss_scale": 1.0, "consumed_samples": 1132800, "global_step/max_steps": "4425/6362"} +{"lm loss": 4.88305235, "grad_norm": 0.38240799, "learning_rate": 2.555e-05, "elapsed_time_per_iteration": 6.54031086, "memory(GiB)": 21.51, "elapsed_time": "8h 4m 53s", "remaining_time": "3h 32m 5s", "loss_scale": 1.0, "consumed_samples": 1133056, "global_step/max_steps": "4426/6362"} +{"lm loss": 4.86900759, "grad_norm": 0.44785848, "learning_rate": 2.553e-05, "elapsed_time_per_iteration": 6.61735511, "memory(GiB)": 21.51, "elapsed_time": "8h 4m 59s", "remaining_time": "3h 31m 59s", "loss_scale": 1.0, "consumed_samples": 1133312, "global_step/max_steps": "4427/6362"} +{"lm loss": 4.90695143, "grad_norm": 0.38622388, "learning_rate": 2.551e-05, "elapsed_time_per_iteration": 6.48669338, "memory(GiB)": 21.51, "elapsed_time": "8h 5m 6s", "remaining_time": "3h 31m 52s", "loss_scale": 1.0, "consumed_samples": 1133568, "global_step/max_steps": "4428/6362"} +{"lm loss": 4.8840766, "grad_norm": 0.39689171, "learning_rate": 2.549e-05, "elapsed_time_per_iteration": 6.36671329, "memory(GiB)": 21.51, "elapsed_time": "8h 5m 12s", "remaining_time": "3h 31m 45s", "loss_scale": 1.0, "consumed_samples": 1133824, "global_step/max_steps": "4429/6362"} +{"lm loss": 4.87190723, "grad_norm": 0.39721125, "learning_rate": 2.547e-05, "elapsed_time_per_iteration": 6.61311913, "memory(GiB)": 21.51, "elapsed_time": "8h 5m 19s", "remaining_time": "3h 31m 39s", "loss_scale": 1.0, "consumed_samples": 1134080, "global_step/max_steps": "4430/6362"} +{"lm loss": 4.89091825, "grad_norm": 0.37602192, "learning_rate": 2.545e-05, "elapsed_time_per_iteration": 6.57923102, "memory(GiB)": 21.51, "elapsed_time": "8h 5m 25s", "remaining_time": "3h 31m 32s", "loss_scale": 1.0, "consumed_samples": 1134336, "global_step/max_steps": "4431/6362"} +{"lm loss": 4.89459705, "grad_norm": 0.39402887, "learning_rate": 2.543e-05, "elapsed_time_per_iteration": 6.57421064, "memory(GiB)": 21.51, "elapsed_time": "8h 5m 32s", "remaining_time": "3h 31m 26s", "loss_scale": 1.0, "consumed_samples": 1134592, "global_step/max_steps": "4432/6362"} +{"lm loss": 4.87234449, "grad_norm": 0.36398801, "learning_rate": 2.541e-05, "elapsed_time_per_iteration": 6.54623532, "memory(GiB)": 21.51, "elapsed_time": "8h 5m 38s", "remaining_time": "3h 31m 19s", "loss_scale": 1.0, "consumed_samples": 1134848, "global_step/max_steps": "4433/6362"} +{"lm loss": 4.88776112, "grad_norm": 0.38929567, "learning_rate": 2.538e-05, "elapsed_time_per_iteration": 6.57637596, "memory(GiB)": 21.51, "elapsed_time": "8h 5m 45s", "remaining_time": "3h 31m 13s", "loss_scale": 1.0, "consumed_samples": 1135104, "global_step/max_steps": "4434/6362"} +{"lm loss": 4.89058304, "grad_norm": 0.40719977, "learning_rate": 2.536e-05, "elapsed_time_per_iteration": 6.3752079, "memory(GiB)": 21.51, "elapsed_time": "8h 5m 51s", "remaining_time": "3h 31m 6s", "loss_scale": 1.0, "consumed_samples": 1135360, "global_step/max_steps": "4435/6362"} +{"lm loss": 4.88608551, "grad_norm": 0.38834527, "learning_rate": 2.534e-05, "elapsed_time_per_iteration": 6.60507989, "memory(GiB)": 21.51, "elapsed_time": "8h 5m 58s", "remaining_time": "3h 30m 59s", "loss_scale": 1.0, "consumed_samples": 1135616, "global_step/max_steps": "4436/6362"} +{"lm loss": 4.88318586, "grad_norm": 0.36245397, "learning_rate": 2.532e-05, "elapsed_time_per_iteration": 6.5358181, "memory(GiB)": 21.51, "elapsed_time": "8h 6m 5s", "remaining_time": "3h 30m 53s", "loss_scale": 1.0, "consumed_samples": 1135872, "global_step/max_steps": "4437/6362"} +{"lm loss": 4.885777, "grad_norm": 0.37269655, "learning_rate": 2.53e-05, "elapsed_time_per_iteration": 6.55906177, "memory(GiB)": 21.51, "elapsed_time": "8h 6m 11s", "remaining_time": "3h 30m 46s", "loss_scale": 1.0, "consumed_samples": 1136128, "global_step/max_steps": "4438/6362"} +{"lm loss": 4.89950037, "grad_norm": 0.37877628, "learning_rate": 2.528e-05, "elapsed_time_per_iteration": 6.35695004, "memory(GiB)": 21.51, "elapsed_time": "8h 6m 17s", "remaining_time": "3h 30m 40s", "loss_scale": 1.0, "consumed_samples": 1136384, "global_step/max_steps": "4439/6362"} +{"lm loss": 4.89246273, "grad_norm": 0.37899351, "learning_rate": 2.526e-05, "elapsed_time_per_iteration": 6.37346506, "memory(GiB)": 21.51, "elapsed_time": "8h 6m 24s", "remaining_time": "3h 30m 33s", "loss_scale": 1.0, "consumed_samples": 1136640, "global_step/max_steps": "4440/6362"} +{"lm loss": 4.89630604, "grad_norm": 0.37816623, "learning_rate": 2.524e-05, "elapsed_time_per_iteration": 6.5673449, "memory(GiB)": 21.51, "elapsed_time": "8h 6m 30s", "remaining_time": "3h 30m 26s", "loss_scale": 1.0, "consumed_samples": 1136896, "global_step/max_steps": "4441/6362"} +{"lm loss": 4.8522768, "grad_norm": 0.36703458, "learning_rate": 2.521e-05, "elapsed_time_per_iteration": 6.71697283, "memory(GiB)": 21.51, "elapsed_time": "8h 6m 37s", "remaining_time": "3h 30m 20s", "loss_scale": 1.0, "consumed_samples": 1137152, "global_step/max_steps": "4442/6362"} +{"lm loss": 4.86308956, "grad_norm": 0.39585015, "learning_rate": 2.519e-05, "elapsed_time_per_iteration": 6.39641118, "memory(GiB)": 21.51, "elapsed_time": "8h 6m 44s", "remaining_time": "3h 30m 13s", "loss_scale": 1.0, "consumed_samples": 1137408, "global_step/max_steps": "4443/6362"} +{"lm loss": 4.88903427, "grad_norm": 0.38709098, "learning_rate": 2.517e-05, "elapsed_time_per_iteration": 6.5926249, "memory(GiB)": 21.51, "elapsed_time": "8h 6m 50s", "remaining_time": "3h 30m 7s", "loss_scale": 1.0, "consumed_samples": 1137664, "global_step/max_steps": "4444/6362"} +{"lm loss": 4.86835003, "grad_norm": 0.34498182, "learning_rate": 2.515e-05, "elapsed_time_per_iteration": 6.69134498, "memory(GiB)": 21.51, "elapsed_time": "8h 6m 57s", "remaining_time": "3h 30m 0s", "loss_scale": 1.0, "consumed_samples": 1137920, "global_step/max_steps": "4445/6362"} +{"lm loss": 4.899508, "grad_norm": 0.35997692, "learning_rate": 2.513e-05, "elapsed_time_per_iteration": 6.71261597, "memory(GiB)": 21.51, "elapsed_time": "8h 7m 4s", "remaining_time": "3h 29m 54s", "loss_scale": 1.0, "consumed_samples": 1138176, "global_step/max_steps": "4446/6362"} +{"lm loss": 4.87091923, "grad_norm": 0.33263034, "learning_rate": 2.511e-05, "elapsed_time_per_iteration": 6.61846304, "memory(GiB)": 21.51, "elapsed_time": "8h 7m 10s", "remaining_time": "3h 29m 47s", "loss_scale": 1.0, "consumed_samples": 1138432, "global_step/max_steps": "4447/6362"} +{"lm loss": 4.84890556, "grad_norm": 0.36505291, "learning_rate": 2.509e-05, "elapsed_time_per_iteration": 6.63264441, "memory(GiB)": 21.51, "elapsed_time": "8h 7m 17s", "remaining_time": "3h 29m 40s", "loss_scale": 1.0, "consumed_samples": 1138688, "global_step/max_steps": "4448/6362"} +{"lm loss": 4.89569664, "grad_norm": 0.36095491, "learning_rate": 2.507e-05, "elapsed_time_per_iteration": 6.45569587, "memory(GiB)": 21.51, "elapsed_time": "8h 7m 23s", "remaining_time": "3h 29m 34s", "loss_scale": 1.0, "consumed_samples": 1138944, "global_step/max_steps": "4449/6362"} +{"lm loss": 4.89426517, "grad_norm": 0.34660873, "learning_rate": 2.504e-05, "elapsed_time_per_iteration": 6.47350407, "memory(GiB)": 21.51, "elapsed_time": "8h 7m 30s", "remaining_time": "3h 29m 27s", "loss_scale": 1.0, "consumed_samples": 1139200, "global_step/max_steps": "4450/6362"} +{"lm loss": 4.88330555, "grad_norm": 0.38713107, "learning_rate": 2.502e-05, "elapsed_time_per_iteration": 6.75108147, "memory(GiB)": 21.51, "elapsed_time": "8h 7m 36s", "remaining_time": "3h 29m 21s", "loss_scale": 1.0, "consumed_samples": 1139456, "global_step/max_steps": "4451/6362"} +{"lm loss": 4.88769341, "grad_norm": 0.36437193, "learning_rate": 2.5e-05, "elapsed_time_per_iteration": 6.89834094, "memory(GiB)": 21.51, "elapsed_time": "8h 7m 43s", "remaining_time": "3h 29m 14s", "loss_scale": 1.0, "consumed_samples": 1139712, "global_step/max_steps": "4452/6362"} +{"lm loss": 4.90208197, "grad_norm": 0.44095162, "learning_rate": 2.498e-05, "elapsed_time_per_iteration": 6.49097395, "memory(GiB)": 21.51, "elapsed_time": "8h 7m 50s", "remaining_time": "3h 29m 8s", "loss_scale": 1.0, "consumed_samples": 1139968, "global_step/max_steps": "4453/6362"} +{"lm loss": 4.8760457, "grad_norm": 0.39282548, "learning_rate": 2.496e-05, "elapsed_time_per_iteration": 6.61693048, "memory(GiB)": 21.51, "elapsed_time": "8h 7m 56s", "remaining_time": "3h 29m 1s", "loss_scale": 1.0, "consumed_samples": 1140224, "global_step/max_steps": "4454/6362"} +{"lm loss": 4.88249111, "grad_norm": 0.41572455, "learning_rate": 2.494e-05, "elapsed_time_per_iteration": 6.58426309, "memory(GiB)": 21.51, "elapsed_time": "8h 8m 3s", "remaining_time": "3h 28m 55s", "loss_scale": 1.0, "consumed_samples": 1140480, "global_step/max_steps": "4455/6362"} +{"lm loss": 4.88043022, "grad_norm": 0.38626435, "learning_rate": 2.492e-05, "elapsed_time_per_iteration": 6.53757334, "memory(GiB)": 21.51, "elapsed_time": "8h 8m 10s", "remaining_time": "3h 28m 48s", "loss_scale": 1.0, "consumed_samples": 1140736, "global_step/max_steps": "4456/6362"} +{"lm loss": 4.91487074, "grad_norm": 0.39396289, "learning_rate": 2.49e-05, "elapsed_time_per_iteration": 6.54826355, "memory(GiB)": 21.51, "elapsed_time": "8h 8m 16s", "remaining_time": "3h 28m 41s", "loss_scale": 1.0, "consumed_samples": 1140992, "global_step/max_steps": "4457/6362"} +{"lm loss": 4.90016937, "grad_norm": 0.36256546, "learning_rate": 2.488e-05, "elapsed_time_per_iteration": 6.6984508, "memory(GiB)": 21.51, "elapsed_time": "8h 8m 23s", "remaining_time": "3h 28m 35s", "loss_scale": 1.0, "consumed_samples": 1141248, "global_step/max_steps": "4458/6362"} +{"lm loss": 4.88076067, "grad_norm": 0.38332453, "learning_rate": 2.485e-05, "elapsed_time_per_iteration": 6.77840853, "memory(GiB)": 21.51, "elapsed_time": "8h 8m 30s", "remaining_time": "3h 28m 28s", "loss_scale": 1.0, "consumed_samples": 1141504, "global_step/max_steps": "4459/6362"} +{"lm loss": 4.90219021, "grad_norm": 0.36990389, "learning_rate": 2.483e-05, "elapsed_time_per_iteration": 6.65564942, "memory(GiB)": 21.51, "elapsed_time": "8h 8m 36s", "remaining_time": "3h 28m 22s", "loss_scale": 1.0, "consumed_samples": 1141760, "global_step/max_steps": "4460/6362"} +{"lm loss": 4.8838315, "grad_norm": 0.37522671, "learning_rate": 2.481e-05, "elapsed_time_per_iteration": 6.47637272, "memory(GiB)": 21.51, "elapsed_time": "8h 8m 43s", "remaining_time": "3h 28m 15s", "loss_scale": 1.0, "consumed_samples": 1142016, "global_step/max_steps": "4461/6362"} +{"lm loss": 4.89573002, "grad_norm": 0.3689121, "learning_rate": 2.479e-05, "elapsed_time_per_iteration": 6.50240183, "memory(GiB)": 21.51, "elapsed_time": "8h 8m 49s", "remaining_time": "3h 28m 9s", "loss_scale": 1.0, "consumed_samples": 1142272, "global_step/max_steps": "4462/6362"} +{"lm loss": 4.88702154, "grad_norm": 0.35259292, "learning_rate": 2.477e-05, "elapsed_time_per_iteration": 6.51895285, "memory(GiB)": 21.51, "elapsed_time": "8h 8m 56s", "remaining_time": "3h 28m 2s", "loss_scale": 1.0, "consumed_samples": 1142528, "global_step/max_steps": "4463/6362"} +{"lm loss": 4.88481092, "grad_norm": 0.34026054, "learning_rate": 2.475e-05, "elapsed_time_per_iteration": 6.75133967, "memory(GiB)": 21.51, "elapsed_time": "8h 9m 3s", "remaining_time": "3h 27m 56s", "loss_scale": 1.0, "consumed_samples": 1142784, "global_step/max_steps": "4464/6362"} +{"lm loss": 4.88154459, "grad_norm": 0.38804331, "learning_rate": 2.473e-05, "elapsed_time_per_iteration": 6.43832636, "memory(GiB)": 21.51, "elapsed_time": "8h 9m 9s", "remaining_time": "3h 27m 49s", "loss_scale": 1.0, "consumed_samples": 1143040, "global_step/max_steps": "4465/6362"} +{"lm loss": 4.87310839, "grad_norm": 0.3434312, "learning_rate": 2.471e-05, "elapsed_time_per_iteration": 6.57731652, "memory(GiB)": 21.51, "elapsed_time": "8h 9m 16s", "remaining_time": "3h 27m 42s", "loss_scale": 1.0, "consumed_samples": 1143296, "global_step/max_steps": "4466/6362"} +{"lm loss": 4.87952709, "grad_norm": 0.40529692, "learning_rate": 2.469e-05, "elapsed_time_per_iteration": 6.82452846, "memory(GiB)": 21.51, "elapsed_time": "8h 9m 22s", "remaining_time": "3h 27m 36s", "loss_scale": 1.0, "consumed_samples": 1143552, "global_step/max_steps": "4467/6362"} +{"lm loss": 4.88254499, "grad_norm": 0.36023501, "learning_rate": 2.467e-05, "elapsed_time_per_iteration": 6.63030243, "memory(GiB)": 21.51, "elapsed_time": "8h 9m 29s", "remaining_time": "3h 27m 29s", "loss_scale": 1.0, "consumed_samples": 1143808, "global_step/max_steps": "4468/6362"} +{"lm loss": 4.8997426, "grad_norm": 0.37393805, "learning_rate": 2.464e-05, "elapsed_time_per_iteration": 6.48897672, "memory(GiB)": 21.51, "elapsed_time": "8h 9m 35s", "remaining_time": "3h 27m 23s", "loss_scale": 1.0, "consumed_samples": 1144064, "global_step/max_steps": "4469/6362"} +{"lm loss": 4.90294027, "grad_norm": 0.35693058, "learning_rate": 2.462e-05, "elapsed_time_per_iteration": 6.40729809, "memory(GiB)": 21.51, "elapsed_time": "8h 9m 42s", "remaining_time": "3h 27m 16s", "loss_scale": 1.0, "consumed_samples": 1144320, "global_step/max_steps": "4470/6362"} +{"lm loss": 4.87737799, "grad_norm": 0.38963825, "learning_rate": 2.46e-05, "elapsed_time_per_iteration": 6.62684011, "memory(GiB)": 21.51, "elapsed_time": "8h 9m 49s", "remaining_time": "3h 27m 10s", "loss_scale": 1.0, "consumed_samples": 1144576, "global_step/max_steps": "4471/6362"} +{"lm loss": 4.8964262, "grad_norm": 0.37135696, "learning_rate": 2.458e-05, "elapsed_time_per_iteration": 6.54402947, "memory(GiB)": 21.51, "elapsed_time": "8h 9m 55s", "remaining_time": "3h 27m 3s", "loss_scale": 1.0, "consumed_samples": 1144832, "global_step/max_steps": "4472/6362"} +{"lm loss": 4.88367558, "grad_norm": 0.3673656, "learning_rate": 2.456e-05, "elapsed_time_per_iteration": 6.61829686, "memory(GiB)": 21.51, "elapsed_time": "8h 10m 2s", "remaining_time": "3h 26m 56s", "loss_scale": 1.0, "consumed_samples": 1145088, "global_step/max_steps": "4473/6362"} +{"lm loss": 4.85911179, "grad_norm": 0.4007493, "learning_rate": 2.454e-05, "elapsed_time_per_iteration": 6.77371526, "memory(GiB)": 21.51, "elapsed_time": "8h 10m 8s", "remaining_time": "3h 26m 50s", "loss_scale": 1.0, "consumed_samples": 1145344, "global_step/max_steps": "4474/6362"} +{"lm loss": 4.89464569, "grad_norm": 0.38150263, "learning_rate": 2.452e-05, "elapsed_time_per_iteration": 6.44809079, "memory(GiB)": 21.51, "elapsed_time": "8h 10m 15s", "remaining_time": "3h 26m 43s", "loss_scale": 1.0, "consumed_samples": 1145600, "global_step/max_steps": "4475/6362"} +{"lm loss": 4.90440559, "grad_norm": 0.38790822, "learning_rate": 2.45e-05, "elapsed_time_per_iteration": 6.51918936, "memory(GiB)": 21.51, "elapsed_time": "8h 10m 21s", "remaining_time": "3h 26m 37s", "loss_scale": 1.0, "consumed_samples": 1145856, "global_step/max_steps": "4476/6362"} +{"lm loss": 4.90359783, "grad_norm": 0.38879251, "learning_rate": 2.448e-05, "elapsed_time_per_iteration": 6.44322109, "memory(GiB)": 21.51, "elapsed_time": "8h 10m 28s", "remaining_time": "3h 26m 30s", "loss_scale": 1.0, "consumed_samples": 1146112, "global_step/max_steps": "4477/6362"} +{"lm loss": 4.88533401, "grad_norm": 0.41452762, "learning_rate": 2.446e-05, "elapsed_time_per_iteration": 6.79900455, "memory(GiB)": 21.51, "elapsed_time": "8h 10m 35s", "remaining_time": "3h 26m 24s", "loss_scale": 1.0, "consumed_samples": 1146368, "global_step/max_steps": "4478/6362"} +{"lm loss": 4.88216543, "grad_norm": 0.3959775, "learning_rate": 2.444e-05, "elapsed_time_per_iteration": 6.48890924, "memory(GiB)": 21.51, "elapsed_time": "8h 10m 41s", "remaining_time": "3h 26m 17s", "loss_scale": 1.0, "consumed_samples": 1146624, "global_step/max_steps": "4479/6362"} +{"lm loss": 4.89406967, "grad_norm": 0.3820658, "learning_rate": 2.441e-05, "elapsed_time_per_iteration": 6.69758105, "memory(GiB)": 21.51, "elapsed_time": "8h 10m 48s", "remaining_time": "3h 26m 10s", "loss_scale": 1.0, "consumed_samples": 1146880, "global_step/max_steps": "4480/6362"} +{"lm loss": 4.86699104, "grad_norm": 0.40723673, "learning_rate": 2.439e-05, "elapsed_time_per_iteration": 6.5328908, "memory(GiB)": 21.51, "elapsed_time": "8h 10m 54s", "remaining_time": "3h 26m 4s", "loss_scale": 1.0, "consumed_samples": 1147136, "global_step/max_steps": "4481/6362"} +{"lm loss": 4.88286209, "grad_norm": 0.37384424, "learning_rate": 2.437e-05, "elapsed_time_per_iteration": 6.5178957, "memory(GiB)": 21.51, "elapsed_time": "8h 11m 1s", "remaining_time": "3h 25m 57s", "loss_scale": 1.0, "consumed_samples": 1147392, "global_step/max_steps": "4482/6362"} +{"lm loss": 4.87311077, "grad_norm": 0.40507138, "learning_rate": 2.435e-05, "elapsed_time_per_iteration": 6.75626683, "memory(GiB)": 21.51, "elapsed_time": "8h 11m 8s", "remaining_time": "3h 25m 51s", "loss_scale": 1.0, "consumed_samples": 1147648, "global_step/max_steps": "4483/6362"} +{"lm loss": 4.90447807, "grad_norm": 0.37440881, "learning_rate": 2.433e-05, "elapsed_time_per_iteration": 6.69281101, "memory(GiB)": 21.51, "elapsed_time": "8h 11m 14s", "remaining_time": "3h 25m 44s", "loss_scale": 1.0, "consumed_samples": 1147904, "global_step/max_steps": "4484/6362"} +{"lm loss": 4.87755775, "grad_norm": 0.38123721, "learning_rate": 2.431e-05, "elapsed_time_per_iteration": 6.59976673, "memory(GiB)": 21.51, "elapsed_time": "8h 11m 21s", "remaining_time": "3h 25m 38s", "loss_scale": 1.0, "consumed_samples": 1148160, "global_step/max_steps": "4485/6362"} +{"lm loss": 4.89250851, "grad_norm": 0.35837901, "learning_rate": 2.429e-05, "elapsed_time_per_iteration": 6.69869542, "memory(GiB)": 21.51, "elapsed_time": "8h 11m 28s", "remaining_time": "3h 25m 31s", "loss_scale": 1.0, "consumed_samples": 1148416, "global_step/max_steps": "4486/6362"} +{"lm loss": 4.87709332, "grad_norm": 0.38697764, "learning_rate": 2.427e-05, "elapsed_time_per_iteration": 6.75171185, "memory(GiB)": 21.51, "elapsed_time": "8h 11m 34s", "remaining_time": "3h 25m 25s", "loss_scale": 1.0, "consumed_samples": 1148672, "global_step/max_steps": "4487/6362"} +{"lm loss": 4.91043854, "grad_norm": 0.36862227, "learning_rate": 2.425e-05, "elapsed_time_per_iteration": 6.61618924, "memory(GiB)": 21.51, "elapsed_time": "8h 11m 41s", "remaining_time": "3h 25m 18s", "loss_scale": 1.0, "consumed_samples": 1148928, "global_step/max_steps": "4488/6362"} +{"lm loss": 4.90327787, "grad_norm": 0.3788707, "learning_rate": 2.423e-05, "elapsed_time_per_iteration": 6.51057363, "memory(GiB)": 21.51, "elapsed_time": "8h 11m 48s", "remaining_time": "3h 25m 11s", "loss_scale": 1.0, "consumed_samples": 1149184, "global_step/max_steps": "4489/6362"} +{"lm loss": 4.87464523, "grad_norm": 0.37628317, "learning_rate": 2.421e-05, "elapsed_time_per_iteration": 6.39738941, "memory(GiB)": 21.51, "elapsed_time": "8h 11m 54s", "remaining_time": "3h 25m 5s", "loss_scale": 1.0, "consumed_samples": 1149440, "global_step/max_steps": "4490/6362"} +{"lm loss": 4.90364456, "grad_norm": 0.38048685, "learning_rate": 2.418e-05, "elapsed_time_per_iteration": 6.37922406, "memory(GiB)": 21.51, "elapsed_time": "8h 12m 0s", "remaining_time": "3h 24m 58s", "loss_scale": 1.0, "consumed_samples": 1149696, "global_step/max_steps": "4491/6362"} +{"lm loss": 4.87195158, "grad_norm": 0.35725185, "learning_rate": 2.416e-05, "elapsed_time_per_iteration": 6.36094618, "memory(GiB)": 21.51, "elapsed_time": "8h 12m 7s", "remaining_time": "3h 24m 52s", "loss_scale": 1.0, "consumed_samples": 1149952, "global_step/max_steps": "4492/6362"} +{"lm loss": 4.86373377, "grad_norm": 0.36593562, "learning_rate": 2.414e-05, "elapsed_time_per_iteration": 6.61762714, "memory(GiB)": 21.51, "elapsed_time": "8h 12m 13s", "remaining_time": "3h 24m 45s", "loss_scale": 1.0, "consumed_samples": 1150208, "global_step/max_steps": "4493/6362"} +{"lm loss": 4.88097382, "grad_norm": 0.39438814, "learning_rate": 2.412e-05, "elapsed_time_per_iteration": 6.51692867, "memory(GiB)": 21.51, "elapsed_time": "8h 12m 20s", "remaining_time": "3h 24m 38s", "loss_scale": 1.0, "consumed_samples": 1150464, "global_step/max_steps": "4494/6362"} +{"lm loss": 4.87427521, "grad_norm": 0.35275593, "learning_rate": 2.41e-05, "elapsed_time_per_iteration": 6.65028524, "memory(GiB)": 21.51, "elapsed_time": "8h 12m 26s", "remaining_time": "3h 24m 32s", "loss_scale": 1.0, "consumed_samples": 1150720, "global_step/max_steps": "4495/6362"} +{"lm loss": 4.87905645, "grad_norm": 0.38071245, "learning_rate": 2.408e-05, "elapsed_time_per_iteration": 6.44839597, "memory(GiB)": 21.51, "elapsed_time": "8h 12m 33s", "remaining_time": "3h 24m 25s", "loss_scale": 1.0, "consumed_samples": 1150976, "global_step/max_steps": "4496/6362"} +{"lm loss": 4.88089943, "grad_norm": 0.36201522, "learning_rate": 2.406e-05, "elapsed_time_per_iteration": 6.64753437, "memory(GiB)": 21.51, "elapsed_time": "8h 12m 40s", "remaining_time": "3h 24m 19s", "loss_scale": 1.0, "consumed_samples": 1151232, "global_step/max_steps": "4497/6362"} +{"lm loss": 4.89230633, "grad_norm": 0.35641178, "learning_rate": 2.404e-05, "elapsed_time_per_iteration": 6.56108856, "memory(GiB)": 21.51, "elapsed_time": "8h 12m 46s", "remaining_time": "3h 24m 12s", "loss_scale": 1.0, "consumed_samples": 1151488, "global_step/max_steps": "4498/6362"} +{"lm loss": 4.89619303, "grad_norm": 0.35093331, "learning_rate": 2.402e-05, "elapsed_time_per_iteration": 6.58292747, "memory(GiB)": 21.51, "elapsed_time": "8h 12m 53s", "remaining_time": "3h 24m 6s", "loss_scale": 1.0, "consumed_samples": 1151744, "global_step/max_steps": "4499/6362"} +{"lm loss": 4.88748741, "grad_norm": 0.36231196, "learning_rate": 2.4e-05, "elapsed_time_per_iteration": 6.76558042, "memory(GiB)": 21.51, "elapsed_time": "8h 12m 59s", "remaining_time": "3h 23m 59s", "loss_scale": 1.0, "consumed_samples": 1152000, "global_step/max_steps": "4500/6362"} +{"lm loss": 4.90448952, "grad_norm": 0.36596644, "learning_rate": 2.398e-05, "elapsed_time_per_iteration": 6.36664081, "memory(GiB)": 21.51, "elapsed_time": "8h 13m 6s", "remaining_time": "3h 23m 52s", "loss_scale": 1.0, "consumed_samples": 1152256, "global_step/max_steps": "4501/6362"} +{"lm loss": 4.86999798, "grad_norm": 0.38045061, "learning_rate": 2.396e-05, "elapsed_time_per_iteration": 6.67181444, "memory(GiB)": 21.51, "elapsed_time": "8h 13m 13s", "remaining_time": "3h 23m 46s", "loss_scale": 1.0, "consumed_samples": 1152512, "global_step/max_steps": "4502/6362"} +{"lm loss": 4.89163351, "grad_norm": 0.37085852, "learning_rate": 2.394e-05, "elapsed_time_per_iteration": 6.52775359, "memory(GiB)": 21.51, "elapsed_time": "8h 13m 19s", "remaining_time": "3h 23m 39s", "loss_scale": 1.0, "consumed_samples": 1152768, "global_step/max_steps": "4503/6362"} +{"lm loss": 4.88774824, "grad_norm": 0.37261686, "learning_rate": 2.391e-05, "elapsed_time_per_iteration": 6.64355874, "memory(GiB)": 21.51, "elapsed_time": "8h 13m 26s", "remaining_time": "3h 23m 33s", "loss_scale": 1.0, "consumed_samples": 1153024, "global_step/max_steps": "4504/6362"} +{"lm loss": 4.86739349, "grad_norm": 0.39101848, "learning_rate": 2.389e-05, "elapsed_time_per_iteration": 6.46518707, "memory(GiB)": 21.51, "elapsed_time": "8h 13m 32s", "remaining_time": "3h 23m 26s", "loss_scale": 1.0, "consumed_samples": 1153280, "global_step/max_steps": "4505/6362"} +{"lm loss": 4.87972832, "grad_norm": 0.39282885, "learning_rate": 2.387e-05, "elapsed_time_per_iteration": 7.14672017, "memory(GiB)": 21.51, "elapsed_time": "8h 13m 39s", "remaining_time": "3h 23m 20s", "loss_scale": 1.0, "consumed_samples": 1153536, "global_step/max_steps": "4506/6362"} +{"lm loss": 4.86680174, "grad_norm": 0.39846799, "learning_rate": 2.385e-05, "elapsed_time_per_iteration": 6.52297425, "memory(GiB)": 21.51, "elapsed_time": "8h 13m 46s", "remaining_time": "3h 23m 13s", "loss_scale": 1.0, "consumed_samples": 1153792, "global_step/max_steps": "4507/6362"} +{"lm loss": 4.87126923, "grad_norm": 0.35064903, "learning_rate": 2.383e-05, "elapsed_time_per_iteration": 6.62986922, "memory(GiB)": 21.51, "elapsed_time": "8h 13m 52s", "remaining_time": "3h 23m 7s", "loss_scale": 1.0, "consumed_samples": 1154048, "global_step/max_steps": "4508/6362"} +{"lm loss": 4.89281464, "grad_norm": 0.35166192, "learning_rate": 2.381e-05, "elapsed_time_per_iteration": 6.50174069, "memory(GiB)": 21.51, "elapsed_time": "8h 13m 59s", "remaining_time": "3h 23m 0s", "loss_scale": 1.0, "consumed_samples": 1154304, "global_step/max_steps": "4509/6362"} +{"lm loss": 4.86758518, "grad_norm": 0.37358993, "learning_rate": 2.379e-05, "elapsed_time_per_iteration": 6.43824077, "memory(GiB)": 21.51, "elapsed_time": "8h 14m 5s", "remaining_time": "3h 22m 53s", "loss_scale": 1.0, "consumed_samples": 1154560, "global_step/max_steps": "4510/6362"} +{"lm loss": 4.8880291, "grad_norm": 0.40827066, "learning_rate": 2.377e-05, "elapsed_time_per_iteration": 6.48760462, "memory(GiB)": 21.51, "elapsed_time": "8h 14m 12s", "remaining_time": "3h 22m 47s", "loss_scale": 1.0, "consumed_samples": 1154816, "global_step/max_steps": "4511/6362"} +{"lm loss": 4.9233284, "grad_norm": 0.40695453, "learning_rate": 2.375e-05, "elapsed_time_per_iteration": 6.5902977, "memory(GiB)": 21.51, "elapsed_time": "8h 14m 18s", "remaining_time": "3h 22m 40s", "loss_scale": 1.0, "consumed_samples": 1155072, "global_step/max_steps": "4512/6362"} +{"lm loss": 4.90083313, "grad_norm": 0.39369509, "learning_rate": 2.373e-05, "elapsed_time_per_iteration": 6.38545966, "memory(GiB)": 21.51, "elapsed_time": "8h 14m 25s", "remaining_time": "3h 22m 34s", "loss_scale": 1.0, "consumed_samples": 1155328, "global_step/max_steps": "4513/6362"} +{"lm loss": 4.91085672, "grad_norm": 0.37532333, "learning_rate": 2.371e-05, "elapsed_time_per_iteration": 6.5156405, "memory(GiB)": 21.51, "elapsed_time": "8h 14m 31s", "remaining_time": "3h 22m 27s", "loss_scale": 1.0, "consumed_samples": 1155584, "global_step/max_steps": "4514/6362"} +{"lm loss": 4.88877916, "grad_norm": 0.36616164, "learning_rate": 2.369e-05, "elapsed_time_per_iteration": 6.47062397, "memory(GiB)": 21.51, "elapsed_time": "8h 14m 38s", "remaining_time": "3h 22m 20s", "loss_scale": 1.0, "consumed_samples": 1155840, "global_step/max_steps": "4515/6362"} +{"lm loss": 4.88931894, "grad_norm": 0.37534401, "learning_rate": 2.367e-05, "elapsed_time_per_iteration": 6.53546858, "memory(GiB)": 21.51, "elapsed_time": "8h 14m 44s", "remaining_time": "3h 22m 14s", "loss_scale": 1.0, "consumed_samples": 1156096, "global_step/max_steps": "4516/6362"} +{"lm loss": 4.88889456, "grad_norm": 0.36524928, "learning_rate": 2.365e-05, "elapsed_time_per_iteration": 6.55045199, "memory(GiB)": 21.51, "elapsed_time": "8h 14m 51s", "remaining_time": "3h 22m 7s", "loss_scale": 1.0, "consumed_samples": 1156352, "global_step/max_steps": "4517/6362"} +{"lm loss": 4.89867783, "grad_norm": 0.35317135, "learning_rate": 2.362e-05, "elapsed_time_per_iteration": 6.61735201, "memory(GiB)": 21.51, "elapsed_time": "8h 14m 58s", "remaining_time": "3h 22m 1s", "loss_scale": 1.0, "consumed_samples": 1156608, "global_step/max_steps": "4518/6362"} +{"lm loss": 4.89554071, "grad_norm": 0.41352499, "learning_rate": 2.36e-05, "elapsed_time_per_iteration": 6.46096873, "memory(GiB)": 21.51, "elapsed_time": "8h 15m 4s", "remaining_time": "3h 21m 54s", "loss_scale": 1.0, "consumed_samples": 1156864, "global_step/max_steps": "4519/6362"} +{"lm loss": 4.9052186, "grad_norm": 0.36049011, "learning_rate": 2.358e-05, "elapsed_time_per_iteration": 6.62409902, "memory(GiB)": 21.51, "elapsed_time": "8h 15m 11s", "remaining_time": "3h 21m 47s", "loss_scale": 1.0, "consumed_samples": 1157120, "global_step/max_steps": "4520/6362"} +{"lm loss": 4.89211559, "grad_norm": 0.4134993, "learning_rate": 2.356e-05, "elapsed_time_per_iteration": 6.76215315, "memory(GiB)": 21.51, "elapsed_time": "8h 15m 17s", "remaining_time": "3h 21m 41s", "loss_scale": 1.0, "consumed_samples": 1157376, "global_step/max_steps": "4521/6362"} +{"lm loss": 4.87901211, "grad_norm": 0.40723205, "learning_rate": 2.354e-05, "elapsed_time_per_iteration": 6.72541308, "memory(GiB)": 21.51, "elapsed_time": "8h 15m 24s", "remaining_time": "3h 21m 34s", "loss_scale": 1.0, "consumed_samples": 1157632, "global_step/max_steps": "4522/6362"} +{"lm loss": 4.89600039, "grad_norm": 0.36705476, "learning_rate": 2.352e-05, "elapsed_time_per_iteration": 6.74386668, "memory(GiB)": 21.51, "elapsed_time": "8h 15m 31s", "remaining_time": "3h 21m 28s", "loss_scale": 1.0, "consumed_samples": 1157888, "global_step/max_steps": "4523/6362"} +{"lm loss": 4.8969841, "grad_norm": 0.43738025, "learning_rate": 2.35e-05, "elapsed_time_per_iteration": 6.74839211, "memory(GiB)": 21.51, "elapsed_time": "8h 15m 38s", "remaining_time": "3h 21m 21s", "loss_scale": 1.0, "consumed_samples": 1158144, "global_step/max_steps": "4524/6362"} +{"lm loss": 4.87879801, "grad_norm": 0.34732792, "learning_rate": 2.348e-05, "elapsed_time_per_iteration": 6.47980022, "memory(GiB)": 21.51, "elapsed_time": "8h 15m 44s", "remaining_time": "3h 21m 15s", "loss_scale": 1.0, "consumed_samples": 1158400, "global_step/max_steps": "4525/6362"} +{"lm loss": 4.90241432, "grad_norm": 0.43529209, "learning_rate": 2.346e-05, "elapsed_time_per_iteration": 6.66621733, "memory(GiB)": 21.51, "elapsed_time": "8h 15m 51s", "remaining_time": "3h 21m 8s", "loss_scale": 1.0, "consumed_samples": 1158656, "global_step/max_steps": "4526/6362"} +{"lm loss": 4.86805916, "grad_norm": 0.40773064, "learning_rate": 2.344e-05, "elapsed_time_per_iteration": 6.55524492, "memory(GiB)": 21.51, "elapsed_time": "8h 15m 57s", "remaining_time": "3h 21m 2s", "loss_scale": 1.0, "consumed_samples": 1158912, "global_step/max_steps": "4527/6362"} +{"lm loss": 4.90297747, "grad_norm": 0.36845875, "learning_rate": 2.342e-05, "elapsed_time_per_iteration": 6.69014335, "memory(GiB)": 21.51, "elapsed_time": "8h 16m 4s", "remaining_time": "3h 20m 55s", "loss_scale": 1.0, "consumed_samples": 1159168, "global_step/max_steps": "4528/6362"} +{"lm loss": 4.88994598, "grad_norm": 0.39224926, "learning_rate": 2.34e-05, "elapsed_time_per_iteration": 6.46603036, "memory(GiB)": 21.51, "elapsed_time": "8h 16m 10s", "remaining_time": "3h 20m 49s", "loss_scale": 1.0, "consumed_samples": 1159424, "global_step/max_steps": "4529/6362"} +{"lm loss": 4.87447071, "grad_norm": 0.40120783, "learning_rate": 2.338e-05, "elapsed_time_per_iteration": 6.55538487, "memory(GiB)": 21.51, "elapsed_time": "8h 16m 17s", "remaining_time": "3h 20m 42s", "loss_scale": 1.0, "consumed_samples": 1159680, "global_step/max_steps": "4530/6362"} +{"lm loss": 4.85871172, "grad_norm": 0.36755255, "learning_rate": 2.336e-05, "elapsed_time_per_iteration": 6.56402445, "memory(GiB)": 21.51, "elapsed_time": "8h 16m 24s", "remaining_time": "3h 20m 35s", "loss_scale": 1.0, "consumed_samples": 1159936, "global_step/max_steps": "4531/6362"} +{"lm loss": 4.89381361, "grad_norm": 0.39564884, "learning_rate": 2.334e-05, "elapsed_time_per_iteration": 6.65103507, "memory(GiB)": 21.51, "elapsed_time": "8h 16m 30s", "remaining_time": "3h 20m 29s", "loss_scale": 1.0, "consumed_samples": 1160192, "global_step/max_steps": "4532/6362"} +{"lm loss": 4.88378334, "grad_norm": 0.34283328, "learning_rate": 2.332e-05, "elapsed_time_per_iteration": 6.46830201, "memory(GiB)": 21.51, "elapsed_time": "8h 16m 37s", "remaining_time": "3h 20m 22s", "loss_scale": 1.0, "consumed_samples": 1160448, "global_step/max_steps": "4533/6362"} +{"lm loss": 4.87581444, "grad_norm": 0.37889016, "learning_rate": 2.33e-05, "elapsed_time_per_iteration": 6.53637147, "memory(GiB)": 21.51, "elapsed_time": "8h 16m 43s", "remaining_time": "3h 20m 16s", "loss_scale": 1.0, "consumed_samples": 1160704, "global_step/max_steps": "4534/6362"} +{"lm loss": 4.88805342, "grad_norm": 0.33939481, "learning_rate": 2.328e-05, "elapsed_time_per_iteration": 6.73908544, "memory(GiB)": 21.51, "elapsed_time": "8h 16m 50s", "remaining_time": "3h 20m 9s", "loss_scale": 1.0, "consumed_samples": 1160960, "global_step/max_steps": "4535/6362"} +{"lm loss": 4.85795021, "grad_norm": 0.34028652, "learning_rate": 2.325e-05, "elapsed_time_per_iteration": 6.65479088, "memory(GiB)": 21.51, "elapsed_time": "8h 16m 57s", "remaining_time": "3h 20m 3s", "loss_scale": 1.0, "consumed_samples": 1161216, "global_step/max_steps": "4536/6362"} +{"lm loss": 4.88795042, "grad_norm": 0.387566, "learning_rate": 2.323e-05, "elapsed_time_per_iteration": 6.54491425, "memory(GiB)": 21.51, "elapsed_time": "8h 17m 3s", "remaining_time": "3h 19m 56s", "loss_scale": 1.0, "consumed_samples": 1161472, "global_step/max_steps": "4537/6362"} +{"lm loss": 4.87650156, "grad_norm": 0.36236706, "learning_rate": 2.321e-05, "elapsed_time_per_iteration": 6.47056365, "memory(GiB)": 21.51, "elapsed_time": "8h 17m 10s", "remaining_time": "3h 19m 49s", "loss_scale": 1.0, "consumed_samples": 1161728, "global_step/max_steps": "4538/6362"} +{"lm loss": 4.89147282, "grad_norm": 0.38338, "learning_rate": 2.319e-05, "elapsed_time_per_iteration": 6.56199837, "memory(GiB)": 21.51, "elapsed_time": "8h 17m 16s", "remaining_time": "3h 19m 43s", "loss_scale": 1.0, "consumed_samples": 1161984, "global_step/max_steps": "4539/6362"} +{"lm loss": 4.87149715, "grad_norm": 0.37902007, "learning_rate": 2.317e-05, "elapsed_time_per_iteration": 6.63891053, "memory(GiB)": 21.51, "elapsed_time": "8h 17m 23s", "remaining_time": "3h 19m 36s", "loss_scale": 1.0, "consumed_samples": 1162240, "global_step/max_steps": "4540/6362"} +{"lm loss": 4.89074039, "grad_norm": 0.35619536, "learning_rate": 2.315e-05, "elapsed_time_per_iteration": 6.60995793, "memory(GiB)": 21.51, "elapsed_time": "8h 17m 29s", "remaining_time": "3h 19m 30s", "loss_scale": 1.0, "consumed_samples": 1162496, "global_step/max_steps": "4541/6362"} +{"lm loss": 4.9160924, "grad_norm": 0.39057478, "learning_rate": 2.313e-05, "elapsed_time_per_iteration": 6.71281219, "memory(GiB)": 21.51, "elapsed_time": "8h 17m 36s", "remaining_time": "3h 19m 23s", "loss_scale": 1.0, "consumed_samples": 1162752, "global_step/max_steps": "4542/6362"} +{"lm loss": 4.89899731, "grad_norm": 0.35577297, "learning_rate": 2.311e-05, "elapsed_time_per_iteration": 6.50400853, "memory(GiB)": 21.51, "elapsed_time": "8h 17m 43s", "remaining_time": "3h 19m 17s", "loss_scale": 1.0, "consumed_samples": 1163008, "global_step/max_steps": "4543/6362"} +{"lm loss": 4.89703178, "grad_norm": 0.38077241, "learning_rate": 2.309e-05, "elapsed_time_per_iteration": 6.51859975, "memory(GiB)": 21.51, "elapsed_time": "8h 17m 49s", "remaining_time": "3h 19m 10s", "loss_scale": 1.0, "consumed_samples": 1163264, "global_step/max_steps": "4544/6362"} +{"lm loss": 4.89609098, "grad_norm": 0.36284921, "learning_rate": 2.307e-05, "elapsed_time_per_iteration": 6.52579594, "memory(GiB)": 21.51, "elapsed_time": "8h 17m 56s", "remaining_time": "3h 19m 3s", "loss_scale": 1.0, "consumed_samples": 1163520, "global_step/max_steps": "4545/6362"} +{"lm loss": 4.8817215, "grad_norm": 0.36236712, "learning_rate": 2.305e-05, "elapsed_time_per_iteration": 6.88410592, "memory(GiB)": 21.51, "elapsed_time": "8h 18m 3s", "remaining_time": "3h 18m 57s", "loss_scale": 1.0, "consumed_samples": 1163776, "global_step/max_steps": "4546/6362"} +{"lm loss": 4.86796141, "grad_norm": 0.38523838, "learning_rate": 2.303e-05, "elapsed_time_per_iteration": 6.56381607, "memory(GiB)": 21.51, "elapsed_time": "8h 18m 9s", "remaining_time": "3h 18m 50s", "loss_scale": 1.0, "consumed_samples": 1164032, "global_step/max_steps": "4547/6362"} +{"lm loss": 4.89323473, "grad_norm": 0.34728277, "learning_rate": 2.301e-05, "elapsed_time_per_iteration": 6.44902396, "memory(GiB)": 21.51, "elapsed_time": "8h 18m 16s", "remaining_time": "3h 18m 44s", "loss_scale": 1.0, "consumed_samples": 1164288, "global_step/max_steps": "4548/6362"} +{"lm loss": 4.90643024, "grad_norm": 0.35662824, "learning_rate": 2.299e-05, "elapsed_time_per_iteration": 6.58883762, "memory(GiB)": 21.51, "elapsed_time": "8h 18m 22s", "remaining_time": "3h 18m 37s", "loss_scale": 1.0, "consumed_samples": 1164544, "global_step/max_steps": "4549/6362"} +{"lm loss": 4.88648462, "grad_norm": 0.34410438, "learning_rate": 2.297e-05, "elapsed_time_per_iteration": 6.47731495, "memory(GiB)": 21.51, "elapsed_time": "8h 18m 29s", "remaining_time": "3h 18m 31s", "loss_scale": 1.0, "consumed_samples": 1164800, "global_step/max_steps": "4550/6362"} +{"lm loss": 4.85777092, "grad_norm": 0.38056511, "learning_rate": 2.295e-05, "elapsed_time_per_iteration": 6.5200274, "memory(GiB)": 21.51, "elapsed_time": "8h 18m 35s", "remaining_time": "3h 18m 24s", "loss_scale": 1.0, "consumed_samples": 1165056, "global_step/max_steps": "4551/6362"} +{"lm loss": 4.87257338, "grad_norm": 0.33992776, "learning_rate": 2.293e-05, "elapsed_time_per_iteration": 6.63535023, "memory(GiB)": 21.51, "elapsed_time": "8h 18m 42s", "remaining_time": "3h 18m 17s", "loss_scale": 1.0, "consumed_samples": 1165312, "global_step/max_steps": "4552/6362"} +{"lm loss": 4.87112808, "grad_norm": 0.36406583, "learning_rate": 2.291e-05, "elapsed_time_per_iteration": 6.82352996, "memory(GiB)": 21.51, "elapsed_time": "8h 18m 49s", "remaining_time": "3h 18m 11s", "loss_scale": 1.0, "consumed_samples": 1165568, "global_step/max_steps": "4553/6362"} +{"lm loss": 4.88555002, "grad_norm": 0.37727728, "learning_rate": 2.289e-05, "elapsed_time_per_iteration": 6.49845624, "memory(GiB)": 21.51, "elapsed_time": "8h 18m 55s", "remaining_time": "3h 18m 4s", "loss_scale": 1.0, "consumed_samples": 1165824, "global_step/max_steps": "4554/6362"} +{"lm loss": 4.89590454, "grad_norm": 0.3402687, "learning_rate": 2.287e-05, "elapsed_time_per_iteration": 6.75137758, "memory(GiB)": 21.51, "elapsed_time": "8h 19m 2s", "remaining_time": "3h 17m 58s", "loss_scale": 1.0, "consumed_samples": 1166080, "global_step/max_steps": "4555/6362"} +{"lm loss": 4.87249517, "grad_norm": 0.34661815, "learning_rate": 2.285e-05, "elapsed_time_per_iteration": 6.4742341, "memory(GiB)": 21.51, "elapsed_time": "8h 19m 8s", "remaining_time": "3h 17m 51s", "loss_scale": 1.0, "consumed_samples": 1166336, "global_step/max_steps": "4556/6362"} +{"lm loss": 4.90665054, "grad_norm": 0.33259961, "learning_rate": 2.283e-05, "elapsed_time_per_iteration": 6.72417521, "memory(GiB)": 21.51, "elapsed_time": "8h 19m 15s", "remaining_time": "3h 17m 45s", "loss_scale": 1.0, "consumed_samples": 1166592, "global_step/max_steps": "4557/6362"} +{"lm loss": 4.88291645, "grad_norm": 0.38612574, "learning_rate": 2.281e-05, "elapsed_time_per_iteration": 6.49523568, "memory(GiB)": 21.51, "elapsed_time": "8h 19m 22s", "remaining_time": "3h 17m 38s", "loss_scale": 1.0, "consumed_samples": 1166848, "global_step/max_steps": "4558/6362"} +{"lm loss": 4.872437, "grad_norm": 0.35771149, "learning_rate": 2.279e-05, "elapsed_time_per_iteration": 6.72859859, "memory(GiB)": 21.51, "elapsed_time": "8h 19m 28s", "remaining_time": "3h 17m 32s", "loss_scale": 1.0, "consumed_samples": 1167104, "global_step/max_steps": "4559/6362"} +{"lm loss": 4.87777233, "grad_norm": 0.34546059, "learning_rate": 2.277e-05, "elapsed_time_per_iteration": 6.6246829, "memory(GiB)": 21.51, "elapsed_time": "8h 19m 35s", "remaining_time": "3h 17m 25s", "loss_scale": 1.0, "consumed_samples": 1167360, "global_step/max_steps": "4560/6362"} +{"lm loss": 4.86202908, "grad_norm": 0.39187062, "learning_rate": 2.274e-05, "elapsed_time_per_iteration": 6.55228972, "memory(GiB)": 21.51, "elapsed_time": "8h 19m 41s", "remaining_time": "3h 17m 18s", "loss_scale": 1.0, "consumed_samples": 1167616, "global_step/max_steps": "4561/6362"} +{"lm loss": 4.88457584, "grad_norm": 0.36669177, "learning_rate": 2.272e-05, "elapsed_time_per_iteration": 6.66892505, "memory(GiB)": 21.51, "elapsed_time": "8h 19m 48s", "remaining_time": "3h 17m 12s", "loss_scale": 1.0, "consumed_samples": 1167872, "global_step/max_steps": "4562/6362"} +{"lm loss": 4.87087536, "grad_norm": 0.37813517, "learning_rate": 2.27e-05, "elapsed_time_per_iteration": 6.77355051, "memory(GiB)": 21.51, "elapsed_time": "8h 19m 55s", "remaining_time": "3h 17m 5s", "loss_scale": 1.0, "consumed_samples": 1168128, "global_step/max_steps": "4563/6362"} +{"lm loss": 4.90065479, "grad_norm": 0.37380746, "learning_rate": 2.268e-05, "elapsed_time_per_iteration": 6.53512049, "memory(GiB)": 21.51, "elapsed_time": "8h 20m 1s", "remaining_time": "3h 16m 59s", "loss_scale": 1.0, "consumed_samples": 1168384, "global_step/max_steps": "4564/6362"} +{"lm loss": 4.87357521, "grad_norm": 0.36932397, "learning_rate": 2.266e-05, "elapsed_time_per_iteration": 6.58553362, "memory(GiB)": 21.51, "elapsed_time": "8h 20m 8s", "remaining_time": "3h 16m 52s", "loss_scale": 1.0, "consumed_samples": 1168640, "global_step/max_steps": "4565/6362"} +{"lm loss": 4.86723232, "grad_norm": 0.43327293, "learning_rate": 2.264e-05, "elapsed_time_per_iteration": 6.75580549, "memory(GiB)": 21.51, "elapsed_time": "8h 20m 15s", "remaining_time": "3h 16m 46s", "loss_scale": 1.0, "consumed_samples": 1168896, "global_step/max_steps": "4566/6362"} +{"lm loss": 4.88219357, "grad_norm": 0.36543235, "learning_rate": 2.262e-05, "elapsed_time_per_iteration": 6.44978404, "memory(GiB)": 21.51, "elapsed_time": "8h 20m 21s", "remaining_time": "3h 16m 39s", "loss_scale": 1.0, "consumed_samples": 1169152, "global_step/max_steps": "4567/6362"} +{"lm loss": 4.90549278, "grad_norm": 0.37085551, "learning_rate": 2.26e-05, "elapsed_time_per_iteration": 6.46939635, "memory(GiB)": 21.51, "elapsed_time": "8h 20m 28s", "remaining_time": "3h 16m 33s", "loss_scale": 1.0, "consumed_samples": 1169408, "global_step/max_steps": "4568/6362"} +{"lm loss": 4.87978649, "grad_norm": 0.35448992, "learning_rate": 2.258e-05, "elapsed_time_per_iteration": 6.61299229, "memory(GiB)": 21.51, "elapsed_time": "8h 20m 34s", "remaining_time": "3h 16m 26s", "loss_scale": 1.0, "consumed_samples": 1169664, "global_step/max_steps": "4569/6362"} +{"lm loss": 4.88254118, "grad_norm": 0.37803701, "learning_rate": 2.256e-05, "elapsed_time_per_iteration": 6.67844939, "memory(GiB)": 21.51, "elapsed_time": "8h 20m 41s", "remaining_time": "3h 16m 19s", "loss_scale": 1.0, "consumed_samples": 1169920, "global_step/max_steps": "4570/6362"} +{"lm loss": 4.89789677, "grad_norm": 0.36177626, "learning_rate": 2.254e-05, "elapsed_time_per_iteration": 6.71597004, "memory(GiB)": 21.51, "elapsed_time": "8h 20m 48s", "remaining_time": "3h 16m 13s", "loss_scale": 1.0, "consumed_samples": 1170176, "global_step/max_steps": "4571/6362"} +{"lm loss": 4.8816576, "grad_norm": 0.36452514, "learning_rate": 2.252e-05, "elapsed_time_per_iteration": 6.46277261, "memory(GiB)": 21.51, "elapsed_time": "8h 20m 54s", "remaining_time": "3h 16m 6s", "loss_scale": 1.0, "consumed_samples": 1170432, "global_step/max_steps": "4572/6362"} +{"lm loss": 4.88946056, "grad_norm": 0.40213922, "learning_rate": 2.25e-05, "elapsed_time_per_iteration": 6.54590893, "memory(GiB)": 21.51, "elapsed_time": "8h 21m 1s", "remaining_time": "3h 16m 0s", "loss_scale": 1.0, "consumed_samples": 1170688, "global_step/max_steps": "4573/6362"} +{"lm loss": 4.91931534, "grad_norm": 0.36288121, "learning_rate": 2.248e-05, "elapsed_time_per_iteration": 6.59648681, "memory(GiB)": 21.51, "elapsed_time": "8h 21m 7s", "remaining_time": "3h 15m 53s", "loss_scale": 1.0, "consumed_samples": 1170944, "global_step/max_steps": "4574/6362"} +{"lm loss": 4.88753986, "grad_norm": 0.44002175, "learning_rate": 2.246e-05, "elapsed_time_per_iteration": 6.58968687, "memory(GiB)": 21.51, "elapsed_time": "8h 21m 14s", "remaining_time": "3h 15m 47s", "loss_scale": 1.0, "consumed_samples": 1171200, "global_step/max_steps": "4575/6362"} +{"lm loss": 4.90704679, "grad_norm": 0.36332542, "learning_rate": 2.244e-05, "elapsed_time_per_iteration": 6.7355144, "memory(GiB)": 21.51, "elapsed_time": "8h 21m 21s", "remaining_time": "3h 15m 40s", "loss_scale": 1.0, "consumed_samples": 1171456, "global_step/max_steps": "4576/6362"} +{"lm loss": 4.89946365, "grad_norm": 0.42102849, "learning_rate": 2.242e-05, "elapsed_time_per_iteration": 6.45934272, "memory(GiB)": 21.51, "elapsed_time": "8h 21m 27s", "remaining_time": "3h 15m 33s", "loss_scale": 1.0, "consumed_samples": 1171712, "global_step/max_steps": "4577/6362"} +{"lm loss": 4.86716795, "grad_norm": 0.36699602, "learning_rate": 2.24e-05, "elapsed_time_per_iteration": 6.3842144, "memory(GiB)": 21.51, "elapsed_time": "8h 21m 34s", "remaining_time": "3h 15m 27s", "loss_scale": 1.0, "consumed_samples": 1171968, "global_step/max_steps": "4578/6362"} +{"lm loss": 4.85351229, "grad_norm": 0.42236048, "learning_rate": 2.238e-05, "elapsed_time_per_iteration": 6.57919717, "memory(GiB)": 21.51, "elapsed_time": "8h 21m 40s", "remaining_time": "3h 15m 20s", "loss_scale": 1.0, "consumed_samples": 1172224, "global_step/max_steps": "4579/6362"} +{"lm loss": 4.87812376, "grad_norm": 0.3347666, "learning_rate": 2.236e-05, "elapsed_time_per_iteration": 6.64225602, "memory(GiB)": 21.51, "elapsed_time": "8h 21m 47s", "remaining_time": "3h 15m 14s", "loss_scale": 1.0, "consumed_samples": 1172480, "global_step/max_steps": "4580/6362"} +{"lm loss": 4.8713479, "grad_norm": 0.41911161, "learning_rate": 2.234e-05, "elapsed_time_per_iteration": 6.54091763, "memory(GiB)": 21.51, "elapsed_time": "8h 21m 53s", "remaining_time": "3h 15m 7s", "loss_scale": 1.0, "consumed_samples": 1172736, "global_step/max_steps": "4581/6362"} +{"lm loss": 4.88807392, "grad_norm": 0.3431026, "learning_rate": 2.232e-05, "elapsed_time_per_iteration": 6.54257035, "memory(GiB)": 21.51, "elapsed_time": "8h 22m 0s", "remaining_time": "3h 15m 1s", "loss_scale": 1.0, "consumed_samples": 1172992, "global_step/max_steps": "4582/6362"} +{"lm loss": 4.87108088, "grad_norm": 0.41631749, "learning_rate": 2.23e-05, "elapsed_time_per_iteration": 6.72234488, "memory(GiB)": 21.51, "elapsed_time": "8h 22m 7s", "remaining_time": "3h 14m 54s", "loss_scale": 1.0, "consumed_samples": 1173248, "global_step/max_steps": "4583/6362"} +{"lm loss": 4.86932468, "grad_norm": 0.38200459, "learning_rate": 2.228e-05, "elapsed_time_per_iteration": 6.56973267, "memory(GiB)": 21.51, "elapsed_time": "8h 22m 13s", "remaining_time": "3h 14m 47s", "loss_scale": 1.0, "consumed_samples": 1173504, "global_step/max_steps": "4584/6362"} +{"lm loss": 4.87433243, "grad_norm": 0.42609608, "learning_rate": 2.226e-05, "elapsed_time_per_iteration": 6.4389956, "memory(GiB)": 21.51, "elapsed_time": "8h 22m 20s", "remaining_time": "3h 14m 41s", "loss_scale": 1.0, "consumed_samples": 1173760, "global_step/max_steps": "4585/6362"} +{"lm loss": 4.88572693, "grad_norm": 0.33906025, "learning_rate": 2.224e-05, "elapsed_time_per_iteration": 6.48450637, "memory(GiB)": 21.51, "elapsed_time": "8h 22m 26s", "remaining_time": "3h 14m 34s", "loss_scale": 1.0, "consumed_samples": 1174016, "global_step/max_steps": "4586/6362"} +{"lm loss": 4.87081575, "grad_norm": 0.37771615, "learning_rate": 2.222e-05, "elapsed_time_per_iteration": 6.26936984, "memory(GiB)": 21.51, "elapsed_time": "8h 22m 32s", "remaining_time": "3h 14m 28s", "loss_scale": 1.0, "consumed_samples": 1174272, "global_step/max_steps": "4587/6362"} +{"lm loss": 4.86487389, "grad_norm": 0.36496836, "learning_rate": 2.22e-05, "elapsed_time_per_iteration": 6.46530008, "memory(GiB)": 21.51, "elapsed_time": "8h 22m 39s", "remaining_time": "3h 14m 21s", "loss_scale": 1.0, "consumed_samples": 1174528, "global_step/max_steps": "4588/6362"} +{"lm loss": 4.89154005, "grad_norm": 0.38665047, "learning_rate": 2.218e-05, "elapsed_time_per_iteration": 6.58385682, "memory(GiB)": 21.51, "elapsed_time": "8h 22m 45s", "remaining_time": "3h 14m 14s", "loss_scale": 1.0, "consumed_samples": 1174784, "global_step/max_steps": "4589/6362"} +{"lm loss": 4.91194868, "grad_norm": 0.36824232, "learning_rate": 2.216e-05, "elapsed_time_per_iteration": 6.41486812, "memory(GiB)": 21.51, "elapsed_time": "8h 22m 52s", "remaining_time": "3h 14m 8s", "loss_scale": 1.0, "consumed_samples": 1175040, "global_step/max_steps": "4590/6362"} +{"lm loss": 4.89750957, "grad_norm": 0.37128556, "learning_rate": 2.214e-05, "elapsed_time_per_iteration": 6.79331994, "memory(GiB)": 21.51, "elapsed_time": "8h 22m 59s", "remaining_time": "3h 14m 1s", "loss_scale": 1.0, "consumed_samples": 1175296, "global_step/max_steps": "4591/6362"} +{"lm loss": 4.87696743, "grad_norm": 0.42238796, "learning_rate": 2.212e-05, "elapsed_time_per_iteration": 6.55876851, "memory(GiB)": 21.51, "elapsed_time": "8h 23m 5s", "remaining_time": "3h 13m 55s", "loss_scale": 1.0, "consumed_samples": 1175552, "global_step/max_steps": "4592/6362"} +{"lm loss": 4.89444733, "grad_norm": 0.3865141, "learning_rate": 2.21e-05, "elapsed_time_per_iteration": 6.50525904, "memory(GiB)": 21.51, "elapsed_time": "8h 23m 12s", "remaining_time": "3h 13m 48s", "loss_scale": 1.0, "consumed_samples": 1175808, "global_step/max_steps": "4593/6362"} +{"lm loss": 4.89203358, "grad_norm": 0.40122125, "learning_rate": 2.208e-05, "elapsed_time_per_iteration": 6.54246497, "memory(GiB)": 21.51, "elapsed_time": "8h 23m 18s", "remaining_time": "3h 13m 41s", "loss_scale": 1.0, "consumed_samples": 1176064, "global_step/max_steps": "4594/6362"} +{"lm loss": 4.86790133, "grad_norm": 0.37703305, "learning_rate": 2.206e-05, "elapsed_time_per_iteration": 6.66282105, "memory(GiB)": 21.51, "elapsed_time": "8h 23m 25s", "remaining_time": "3h 13m 35s", "loss_scale": 1.0, "consumed_samples": 1176320, "global_step/max_steps": "4595/6362"} +{"lm loss": 4.88627052, "grad_norm": 0.38552579, "learning_rate": 2.204e-05, "elapsed_time_per_iteration": 6.41228652, "memory(GiB)": 21.51, "elapsed_time": "8h 23m 31s", "remaining_time": "3h 13m 28s", "loss_scale": 1.0, "consumed_samples": 1176576, "global_step/max_steps": "4596/6362"} +{"lm loss": 4.89290142, "grad_norm": 0.41564965, "learning_rate": 2.202e-05, "elapsed_time_per_iteration": 6.73149514, "memory(GiB)": 21.51, "elapsed_time": "8h 23m 38s", "remaining_time": "3h 13m 22s", "loss_scale": 1.0, "consumed_samples": 1176832, "global_step/max_steps": "4597/6362"} +{"lm loss": 4.89749479, "grad_norm": 0.36706221, "learning_rate": 2.2e-05, "elapsed_time_per_iteration": 6.57506323, "memory(GiB)": 21.51, "elapsed_time": "8h 23m 45s", "remaining_time": "3h 13m 15s", "loss_scale": 1.0, "consumed_samples": 1177088, "global_step/max_steps": "4598/6362"} +{"lm loss": 4.88426161, "grad_norm": 0.40162376, "learning_rate": 2.198e-05, "elapsed_time_per_iteration": 6.65119767, "memory(GiB)": 21.51, "elapsed_time": "8h 23m 51s", "remaining_time": "3h 13m 9s", "loss_scale": 1.0, "consumed_samples": 1177344, "global_step/max_steps": "4599/6362"} +{"lm loss": 4.91621828, "grad_norm": 0.3695372, "learning_rate": 2.196e-05, "elapsed_time_per_iteration": 6.50337958, "memory(GiB)": 21.51, "elapsed_time": "8h 23m 58s", "remaining_time": "3h 13m 2s", "loss_scale": 1.0, "consumed_samples": 1177600, "global_step/max_steps": "4600/6362"} +{"lm loss": 4.86864138, "grad_norm": 0.39928484, "learning_rate": 2.194e-05, "elapsed_time_per_iteration": 6.5215857, "memory(GiB)": 21.51, "elapsed_time": "8h 24m 4s", "remaining_time": "3h 12m 55s", "loss_scale": 1.0, "consumed_samples": 1177856, "global_step/max_steps": "4601/6362"} +{"lm loss": 4.898283, "grad_norm": 0.37981319, "learning_rate": 2.192e-05, "elapsed_time_per_iteration": 6.61661291, "memory(GiB)": 21.51, "elapsed_time": "8h 24m 11s", "remaining_time": "3h 12m 49s", "loss_scale": 1.0, "consumed_samples": 1178112, "global_step/max_steps": "4602/6362"} +{"lm loss": 4.88005972, "grad_norm": 0.39793101, "learning_rate": 2.19e-05, "elapsed_time_per_iteration": 6.65324807, "memory(GiB)": 21.51, "elapsed_time": "8h 24m 18s", "remaining_time": "3h 12m 42s", "loss_scale": 1.0, "consumed_samples": 1178368, "global_step/max_steps": "4603/6362"} +{"lm loss": 4.88932753, "grad_norm": 0.36938599, "learning_rate": 2.188e-05, "elapsed_time_per_iteration": 6.68225861, "memory(GiB)": 21.51, "elapsed_time": "8h 24m 24s", "remaining_time": "3h 12m 36s", "loss_scale": 1.0, "consumed_samples": 1178624, "global_step/max_steps": "4604/6362"} +{"lm loss": 4.85203028, "grad_norm": 0.40324172, "learning_rate": 2.186e-05, "elapsed_time_per_iteration": 6.4234643, "memory(GiB)": 21.51, "elapsed_time": "8h 24m 31s", "remaining_time": "3h 12m 29s", "loss_scale": 1.0, "consumed_samples": 1178880, "global_step/max_steps": "4605/6362"} +{"lm loss": 4.88471413, "grad_norm": 0.39111689, "learning_rate": 2.184e-05, "elapsed_time_per_iteration": 6.45510197, "memory(GiB)": 21.51, "elapsed_time": "8h 24m 37s", "remaining_time": "3h 12m 23s", "loss_scale": 1.0, "consumed_samples": 1179136, "global_step/max_steps": "4606/6362"} +{"lm loss": 4.88624525, "grad_norm": 0.36487326, "learning_rate": 2.182e-05, "elapsed_time_per_iteration": 6.55050778, "memory(GiB)": 21.51, "elapsed_time": "8h 24m 44s", "remaining_time": "3h 12m 16s", "loss_scale": 1.0, "consumed_samples": 1179392, "global_step/max_steps": "4607/6362"} +{"lm loss": 4.85591221, "grad_norm": 0.37143371, "learning_rate": 2.18e-05, "elapsed_time_per_iteration": 6.67167163, "memory(GiB)": 21.51, "elapsed_time": "8h 24m 50s", "remaining_time": "3h 12m 9s", "loss_scale": 1.0, "consumed_samples": 1179648, "global_step/max_steps": "4608/6362"} +{"lm loss": 4.88771439, "grad_norm": 0.39952227, "learning_rate": 2.178e-05, "elapsed_time_per_iteration": 6.57757115, "memory(GiB)": 21.51, "elapsed_time": "8h 24m 57s", "remaining_time": "3h 12m 3s", "loss_scale": 1.0, "consumed_samples": 1179904, "global_step/max_steps": "4609/6362"} +{"lm loss": 4.86878967, "grad_norm": 0.35530111, "learning_rate": 2.176e-05, "elapsed_time_per_iteration": 6.61205149, "memory(GiB)": 21.51, "elapsed_time": "8h 25m 3s", "remaining_time": "3h 11m 56s", "loss_scale": 1.0, "consumed_samples": 1180160, "global_step/max_steps": "4610/6362"} +{"lm loss": 4.89369535, "grad_norm": 0.38405618, "learning_rate": 2.174e-05, "elapsed_time_per_iteration": 6.49687433, "memory(GiB)": 21.51, "elapsed_time": "8h 25m 10s", "remaining_time": "3h 11m 50s", "loss_scale": 1.0, "consumed_samples": 1180416, "global_step/max_steps": "4611/6362"} +{"lm loss": 4.91050625, "grad_norm": 0.38384545, "learning_rate": 2.172e-05, "elapsed_time_per_iteration": 6.6096704, "memory(GiB)": 21.51, "elapsed_time": "8h 25m 17s", "remaining_time": "3h 11m 43s", "loss_scale": 1.0, "consumed_samples": 1180672, "global_step/max_steps": "4612/6362"} +{"lm loss": 4.8953557, "grad_norm": 0.35867485, "learning_rate": 2.17e-05, "elapsed_time_per_iteration": 6.45119238, "memory(GiB)": 21.51, "elapsed_time": "8h 25m 23s", "remaining_time": "3h 11m 37s", "loss_scale": 1.0, "consumed_samples": 1180928, "global_step/max_steps": "4613/6362"} +{"lm loss": 4.8954525, "grad_norm": 0.40035635, "learning_rate": 2.168e-05, "elapsed_time_per_iteration": 6.58785677, "memory(GiB)": 21.51, "elapsed_time": "8h 25m 30s", "remaining_time": "3h 11m 30s", "loss_scale": 1.0, "consumed_samples": 1181184, "global_step/max_steps": "4614/6362"} +{"lm loss": 4.89111328, "grad_norm": 0.3796922, "learning_rate": 2.166e-05, "elapsed_time_per_iteration": 6.4488492, "memory(GiB)": 21.51, "elapsed_time": "8h 25m 36s", "remaining_time": "3h 11m 23s", "loss_scale": 1.0, "consumed_samples": 1181440, "global_step/max_steps": "4615/6362"} +{"lm loss": 4.88105869, "grad_norm": 0.38786399, "learning_rate": 2.164e-05, "elapsed_time_per_iteration": 6.5037365, "memory(GiB)": 21.51, "elapsed_time": "8h 25m 43s", "remaining_time": "3h 11m 17s", "loss_scale": 1.0, "consumed_samples": 1181696, "global_step/max_steps": "4616/6362"} +{"lm loss": 4.88716364, "grad_norm": 0.41486257, "learning_rate": 2.162e-05, "elapsed_time_per_iteration": 6.71305561, "memory(GiB)": 21.51, "elapsed_time": "8h 25m 49s", "remaining_time": "3h 11m 10s", "loss_scale": 1.0, "consumed_samples": 1181952, "global_step/max_steps": "4617/6362"} +{"lm loss": 4.88040972, "grad_norm": 0.36746556, "learning_rate": 2.16e-05, "elapsed_time_per_iteration": 6.33625579, "memory(GiB)": 21.51, "elapsed_time": "8h 25m 56s", "remaining_time": "3h 11m 4s", "loss_scale": 1.0, "consumed_samples": 1182208, "global_step/max_steps": "4618/6362"} +{"lm loss": 4.89599943, "grad_norm": 0.43006828, "learning_rate": 2.158e-05, "elapsed_time_per_iteration": 6.63693666, "memory(GiB)": 21.51, "elapsed_time": "8h 26m 2s", "remaining_time": "3h 10m 57s", "loss_scale": 1.0, "consumed_samples": 1182464, "global_step/max_steps": "4619/6362"} +{"lm loss": 4.88570595, "grad_norm": 0.34723669, "learning_rate": 2.156e-05, "elapsed_time_per_iteration": 6.43211722, "memory(GiB)": 21.51, "elapsed_time": "8h 26m 9s", "remaining_time": "3h 10m 50s", "loss_scale": 1.0, "consumed_samples": 1182720, "global_step/max_steps": "4620/6362"} +{"lm loss": 4.88155603, "grad_norm": 0.41525728, "learning_rate": 2.154e-05, "elapsed_time_per_iteration": 6.46307778, "memory(GiB)": 21.51, "elapsed_time": "8h 26m 15s", "remaining_time": "3h 10m 44s", "loss_scale": 1.0, "consumed_samples": 1182976, "global_step/max_steps": "4621/6362"} +{"lm loss": 4.89583111, "grad_norm": 0.37070104, "learning_rate": 2.152e-05, "elapsed_time_per_iteration": 6.59458351, "memory(GiB)": 21.51, "elapsed_time": "8h 26m 22s", "remaining_time": "3h 10m 37s", "loss_scale": 1.0, "consumed_samples": 1183232, "global_step/max_steps": "4622/6362"} +{"lm loss": 4.8536253, "grad_norm": 0.37674251, "learning_rate": 2.15e-05, "elapsed_time_per_iteration": 6.61269927, "memory(GiB)": 21.51, "elapsed_time": "8h 26m 28s", "remaining_time": "3h 10m 31s", "loss_scale": 1.0, "consumed_samples": 1183488, "global_step/max_steps": "4623/6362"} +{"lm loss": 4.87877226, "grad_norm": 0.38815325, "learning_rate": 2.148e-05, "elapsed_time_per_iteration": 6.44824147, "memory(GiB)": 21.51, "elapsed_time": "8h 26m 35s", "remaining_time": "3h 10m 24s", "loss_scale": 1.0, "consumed_samples": 1183744, "global_step/max_steps": "4624/6362"} +{"lm loss": 4.89925146, "grad_norm": 0.38389558, "learning_rate": 2.146e-05, "elapsed_time_per_iteration": 6.52349782, "memory(GiB)": 21.51, "elapsed_time": "8h 26m 41s", "remaining_time": "3h 10m 17s", "loss_scale": 1.0, "consumed_samples": 1184000, "global_step/max_steps": "4625/6362"} +{"lm loss": 4.91264009, "grad_norm": 0.40573359, "learning_rate": 2.144e-05, "elapsed_time_per_iteration": 6.4909389, "memory(GiB)": 21.51, "elapsed_time": "8h 26m 48s", "remaining_time": "3h 10m 11s", "loss_scale": 1.0, "consumed_samples": 1184256, "global_step/max_steps": "4626/6362"} +{"lm loss": 4.8973341, "grad_norm": 0.33767805, "learning_rate": 2.142e-05, "elapsed_time_per_iteration": 6.49002957, "memory(GiB)": 21.51, "elapsed_time": "8h 26m 54s", "remaining_time": "3h 10m 4s", "loss_scale": 1.0, "consumed_samples": 1184512, "global_step/max_steps": "4627/6362"} +{"lm loss": 4.88773489, "grad_norm": 0.39127228, "learning_rate": 2.14e-05, "elapsed_time_per_iteration": 6.63075328, "memory(GiB)": 21.51, "elapsed_time": "8h 27m 1s", "remaining_time": "3h 9m 58s", "loss_scale": 1.0, "consumed_samples": 1184768, "global_step/max_steps": "4628/6362"} +{"lm loss": 4.87106657, "grad_norm": 0.33887675, "learning_rate": 2.138e-05, "elapsed_time_per_iteration": 6.50112867, "memory(GiB)": 21.51, "elapsed_time": "8h 27m 7s", "remaining_time": "3h 9m 51s", "loss_scale": 1.0, "consumed_samples": 1185024, "global_step/max_steps": "4629/6362"} +{"lm loss": 4.89219379, "grad_norm": 0.39289987, "learning_rate": 2.136e-05, "elapsed_time_per_iteration": 6.53529358, "memory(GiB)": 21.51, "elapsed_time": "8h 27m 14s", "remaining_time": "3h 9m 44s", "loss_scale": 1.0, "consumed_samples": 1185280, "global_step/max_steps": "4630/6362"} +{"lm loss": 4.92384815, "grad_norm": 0.35613117, "learning_rate": 2.134e-05, "elapsed_time_per_iteration": 6.4642334, "memory(GiB)": 21.51, "elapsed_time": "8h 27m 20s", "remaining_time": "3h 9m 38s", "loss_scale": 1.0, "consumed_samples": 1185536, "global_step/max_steps": "4631/6362"} +{"lm loss": 4.88403177, "grad_norm": 0.34192669, "learning_rate": 2.132e-05, "elapsed_time_per_iteration": 6.65451574, "memory(GiB)": 21.51, "elapsed_time": "8h 27m 27s", "remaining_time": "3h 9m 31s", "loss_scale": 1.0, "consumed_samples": 1185792, "global_step/max_steps": "4632/6362"} +{"lm loss": 4.90250111, "grad_norm": 0.4123759, "learning_rate": 2.13e-05, "elapsed_time_per_iteration": 6.62040877, "memory(GiB)": 21.51, "elapsed_time": "8h 27m 34s", "remaining_time": "3h 9m 25s", "loss_scale": 1.0, "consumed_samples": 1186048, "global_step/max_steps": "4633/6362"} +{"lm loss": 4.87652016, "grad_norm": 0.34950727, "learning_rate": 2.128e-05, "elapsed_time_per_iteration": 6.56692624, "memory(GiB)": 21.51, "elapsed_time": "8h 27m 40s", "remaining_time": "3h 9m 18s", "loss_scale": 1.0, "consumed_samples": 1186304, "global_step/max_steps": "4634/6362"} +{"lm loss": 4.87856674, "grad_norm": 0.40524548, "learning_rate": 2.126e-05, "elapsed_time_per_iteration": 6.49386859, "memory(GiB)": 21.51, "elapsed_time": "8h 27m 47s", "remaining_time": "3h 9m 12s", "loss_scale": 1.0, "consumed_samples": 1186560, "global_step/max_steps": "4635/6362"} +{"lm loss": 4.88846588, "grad_norm": 0.37290949, "learning_rate": 2.124e-05, "elapsed_time_per_iteration": 6.61111641, "memory(GiB)": 21.51, "elapsed_time": "8h 27m 53s", "remaining_time": "3h 9m 5s", "loss_scale": 1.0, "consumed_samples": 1186816, "global_step/max_steps": "4636/6362"} +{"lm loss": 4.87114573, "grad_norm": 0.35021797, "learning_rate": 2.122e-05, "elapsed_time_per_iteration": 6.72904325, "memory(GiB)": 21.51, "elapsed_time": "8h 28m 0s", "remaining_time": "3h 8m 59s", "loss_scale": 1.0, "consumed_samples": 1187072, "global_step/max_steps": "4637/6362"} +{"lm loss": 4.90416145, "grad_norm": 0.3705129, "learning_rate": 2.121e-05, "elapsed_time_per_iteration": 6.41528964, "memory(GiB)": 21.51, "elapsed_time": "8h 28m 7s", "remaining_time": "3h 8m 52s", "loss_scale": 1.0, "consumed_samples": 1187328, "global_step/max_steps": "4638/6362"} +{"lm loss": 4.87842178, "grad_norm": 0.33515489, "learning_rate": 2.119e-05, "elapsed_time_per_iteration": 6.65182662, "memory(GiB)": 21.51, "elapsed_time": "8h 28m 13s", "remaining_time": "3h 8m 45s", "loss_scale": 1.0, "consumed_samples": 1187584, "global_step/max_steps": "4639/6362"} +{"lm loss": 4.88395643, "grad_norm": 0.38325307, "learning_rate": 2.117e-05, "elapsed_time_per_iteration": 6.56374145, "memory(GiB)": 21.51, "elapsed_time": "8h 28m 20s", "remaining_time": "3h 8m 39s", "loss_scale": 1.0, "consumed_samples": 1187840, "global_step/max_steps": "4640/6362"} +{"lm loss": 4.88779783, "grad_norm": 0.35087788, "learning_rate": 2.115e-05, "elapsed_time_per_iteration": 6.6680963, "memory(GiB)": 21.51, "elapsed_time": "8h 28m 26s", "remaining_time": "3h 8m 32s", "loss_scale": 1.0, "consumed_samples": 1188096, "global_step/max_steps": "4641/6362"} +{"lm loss": 4.89195108, "grad_norm": 0.40210065, "learning_rate": 2.113e-05, "elapsed_time_per_iteration": 6.5790031, "memory(GiB)": 21.51, "elapsed_time": "8h 28m 33s", "remaining_time": "3h 8m 26s", "loss_scale": 1.0, "consumed_samples": 1188352, "global_step/max_steps": "4642/6362"} +{"lm loss": 4.87118769, "grad_norm": 0.36667717, "learning_rate": 2.111e-05, "elapsed_time_per_iteration": 6.58545017, "memory(GiB)": 21.51, "elapsed_time": "8h 28m 40s", "remaining_time": "3h 8m 19s", "loss_scale": 1.0, "consumed_samples": 1188608, "global_step/max_steps": "4643/6362"} +{"lm loss": 4.87115431, "grad_norm": 0.37118229, "learning_rate": 2.109e-05, "elapsed_time_per_iteration": 6.57772541, "memory(GiB)": 21.51, "elapsed_time": "8h 28m 46s", "remaining_time": "3h 8m 13s", "loss_scale": 1.0, "consumed_samples": 1188864, "global_step/max_steps": "4644/6362"} +{"lm loss": 4.9022398, "grad_norm": 0.37984064, "learning_rate": 2.107e-05, "elapsed_time_per_iteration": 6.4769907, "memory(GiB)": 21.51, "elapsed_time": "8h 28m 53s", "remaining_time": "3h 8m 6s", "loss_scale": 1.0, "consumed_samples": 1189120, "global_step/max_steps": "4645/6362"} +{"lm loss": 4.89664221, "grad_norm": 0.3625598, "learning_rate": 2.105e-05, "elapsed_time_per_iteration": 6.37250805, "memory(GiB)": 21.51, "elapsed_time": "8h 28m 59s", "remaining_time": "3h 7m 59s", "loss_scale": 1.0, "consumed_samples": 1189376, "global_step/max_steps": "4646/6362"} +{"lm loss": 4.89191389, "grad_norm": 0.39365688, "learning_rate": 2.103e-05, "elapsed_time_per_iteration": 6.4151454, "memory(GiB)": 21.51, "elapsed_time": "8h 29m 5s", "remaining_time": "3h 7m 53s", "loss_scale": 1.0, "consumed_samples": 1189632, "global_step/max_steps": "4647/6362"} +{"lm loss": 4.86658096, "grad_norm": 0.4018304, "learning_rate": 2.101e-05, "elapsed_time_per_iteration": 6.70469618, "memory(GiB)": 21.51, "elapsed_time": "8h 29m 12s", "remaining_time": "3h 7m 46s", "loss_scale": 1.0, "consumed_samples": 1189888, "global_step/max_steps": "4648/6362"} +{"lm loss": 4.88519812, "grad_norm": 0.3448278, "learning_rate": 2.099e-05, "elapsed_time_per_iteration": 6.40229249, "memory(GiB)": 21.51, "elapsed_time": "8h 29m 19s", "remaining_time": "3h 7m 39s", "loss_scale": 1.0, "consumed_samples": 1190144, "global_step/max_steps": "4649/6362"} +{"lm loss": 4.86550617, "grad_norm": 0.39584768, "learning_rate": 2.097e-05, "elapsed_time_per_iteration": 6.47629666, "memory(GiB)": 21.51, "elapsed_time": "8h 29m 25s", "remaining_time": "3h 7m 33s", "loss_scale": 1.0, "consumed_samples": 1190400, "global_step/max_steps": "4650/6362"} +{"lm loss": 4.90372181, "grad_norm": 0.37111783, "learning_rate": 2.095e-05, "elapsed_time_per_iteration": 6.49460673, "memory(GiB)": 21.51, "elapsed_time": "8h 29m 32s", "remaining_time": "3h 7m 26s", "loss_scale": 1.0, "consumed_samples": 1190656, "global_step/max_steps": "4651/6362"} +{"lm loss": 4.86994553, "grad_norm": 0.37516052, "learning_rate": 2.093e-05, "elapsed_time_per_iteration": 6.41760063, "memory(GiB)": 21.51, "elapsed_time": "8h 29m 38s", "remaining_time": "3h 7m 20s", "loss_scale": 1.0, "consumed_samples": 1190912, "global_step/max_steps": "4652/6362"} +{"lm loss": 4.90091944, "grad_norm": 0.36652958, "learning_rate": 2.091e-05, "elapsed_time_per_iteration": 6.60599804, "memory(GiB)": 21.51, "elapsed_time": "8h 29m 45s", "remaining_time": "3h 7m 13s", "loss_scale": 1.0, "consumed_samples": 1191168, "global_step/max_steps": "4653/6362"} +{"lm loss": 4.9015975, "grad_norm": 0.36421412, "learning_rate": 2.089e-05, "elapsed_time_per_iteration": 6.47141981, "memory(GiB)": 21.51, "elapsed_time": "8h 29m 51s", "remaining_time": "3h 7m 6s", "loss_scale": 1.0, "consumed_samples": 1191424, "global_step/max_steps": "4654/6362"} +{"lm loss": 4.86652565, "grad_norm": 0.37896532, "learning_rate": 2.087e-05, "elapsed_time_per_iteration": 6.60563207, "memory(GiB)": 21.51, "elapsed_time": "8h 29m 58s", "remaining_time": "3h 7m 0s", "loss_scale": 1.0, "consumed_samples": 1191680, "global_step/max_steps": "4655/6362"} +{"lm loss": 4.89368486, "grad_norm": 0.3712509, "learning_rate": 2.085e-05, "elapsed_time_per_iteration": 6.53270769, "memory(GiB)": 21.51, "elapsed_time": "8h 30m 4s", "remaining_time": "3h 6m 53s", "loss_scale": 1.0, "consumed_samples": 1191936, "global_step/max_steps": "4656/6362"} +{"lm loss": 4.88791609, "grad_norm": 0.35683185, "learning_rate": 2.083e-05, "elapsed_time_per_iteration": 6.31342316, "memory(GiB)": 21.51, "elapsed_time": "8h 30m 10s", "remaining_time": "3h 6m 47s", "loss_scale": 1.0, "consumed_samples": 1192192, "global_step/max_steps": "4657/6362"} +{"lm loss": 4.85679483, "grad_norm": 0.36563477, "learning_rate": 2.081e-05, "elapsed_time_per_iteration": 6.73098636, "memory(GiB)": 21.51, "elapsed_time": "8h 30m 17s", "remaining_time": "3h 6m 40s", "loss_scale": 1.0, "consumed_samples": 1192448, "global_step/max_steps": "4658/6362"} +{"lm loss": 4.86653328, "grad_norm": 0.35621649, "learning_rate": 2.079e-05, "elapsed_time_per_iteration": 6.61120105, "memory(GiB)": 21.51, "elapsed_time": "8h 30m 24s", "remaining_time": "3h 6m 34s", "loss_scale": 1.0, "consumed_samples": 1192704, "global_step/max_steps": "4659/6362"} +{"lm loss": 4.86424589, "grad_norm": 0.38061029, "learning_rate": 2.077e-05, "elapsed_time_per_iteration": 6.4993577, "memory(GiB)": 21.51, "elapsed_time": "8h 30m 30s", "remaining_time": "3h 6m 27s", "loss_scale": 1.0, "consumed_samples": 1192960, "global_step/max_steps": "4660/6362"} +{"lm loss": 4.88770628, "grad_norm": 0.34255365, "learning_rate": 2.075e-05, "elapsed_time_per_iteration": 6.60400844, "memory(GiB)": 21.51, "elapsed_time": "8h 30m 37s", "remaining_time": "3h 6m 20s", "loss_scale": 1.0, "consumed_samples": 1193216, "global_step/max_steps": "4661/6362"} +{"lm loss": 4.88432598, "grad_norm": 0.35757107, "learning_rate": 2.073e-05, "elapsed_time_per_iteration": 6.39499927, "memory(GiB)": 21.51, "elapsed_time": "8h 30m 43s", "remaining_time": "3h 6m 14s", "loss_scale": 1.0, "consumed_samples": 1193472, "global_step/max_steps": "4662/6362"} +{"lm loss": 4.88821793, "grad_norm": 0.35459521, "learning_rate": 2.072e-05, "elapsed_time_per_iteration": 6.36399388, "memory(GiB)": 21.51, "elapsed_time": "8h 30m 50s", "remaining_time": "3h 6m 7s", "loss_scale": 1.0, "consumed_samples": 1193728, "global_step/max_steps": "4663/6362"} +{"lm loss": 4.88208866, "grad_norm": 0.34237382, "learning_rate": 2.07e-05, "elapsed_time_per_iteration": 6.69007444, "memory(GiB)": 21.51, "elapsed_time": "8h 30m 56s", "remaining_time": "3h 6m 1s", "loss_scale": 1.0, "consumed_samples": 1193984, "global_step/max_steps": "4664/6362"} +{"lm loss": 4.84445429, "grad_norm": 0.36159194, "learning_rate": 2.068e-05, "elapsed_time_per_iteration": 6.66223979, "memory(GiB)": 21.51, "elapsed_time": "8h 31m 3s", "remaining_time": "3h 5m 54s", "loss_scale": 1.0, "consumed_samples": 1194240, "global_step/max_steps": "4665/6362"} +{"lm loss": 4.89905739, "grad_norm": 0.36150891, "learning_rate": 2.066e-05, "elapsed_time_per_iteration": 6.68956709, "memory(GiB)": 21.51, "elapsed_time": "8h 31m 10s", "remaining_time": "3h 5m 48s", "loss_scale": 1.0, "consumed_samples": 1194496, "global_step/max_steps": "4666/6362"} +{"lm loss": 4.8984251, "grad_norm": 0.35052103, "learning_rate": 2.064e-05, "elapsed_time_per_iteration": 6.52440667, "memory(GiB)": 21.51, "elapsed_time": "8h 31m 16s", "remaining_time": "3h 5m 41s", "loss_scale": 1.0, "consumed_samples": 1194752, "global_step/max_steps": "4667/6362"} +{"lm loss": 4.87004328, "grad_norm": 0.35444531, "learning_rate": 2.062e-05, "elapsed_time_per_iteration": 6.68094158, "memory(GiB)": 21.51, "elapsed_time": "8h 31m 23s", "remaining_time": "3h 5m 34s", "loss_scale": 1.0, "consumed_samples": 1195008, "global_step/max_steps": "4668/6362"} +{"lm loss": 4.86086035, "grad_norm": 0.35520044, "learning_rate": 2.06e-05, "elapsed_time_per_iteration": 6.53099036, "memory(GiB)": 21.51, "elapsed_time": "8h 31m 29s", "remaining_time": "3h 5m 28s", "loss_scale": 1.0, "consumed_samples": 1195264, "global_step/max_steps": "4669/6362"} +{"lm loss": 4.8820014, "grad_norm": 0.38432229, "learning_rate": 2.058e-05, "elapsed_time_per_iteration": 6.66131377, "memory(GiB)": 21.51, "elapsed_time": "8h 31m 36s", "remaining_time": "3h 5m 21s", "loss_scale": 1.0, "consumed_samples": 1195520, "global_step/max_steps": "4670/6362"} +{"lm loss": 4.85858631, "grad_norm": 0.37369946, "learning_rate": 2.056e-05, "elapsed_time_per_iteration": 6.3230648, "memory(GiB)": 21.51, "elapsed_time": "8h 31m 42s", "remaining_time": "3h 5m 15s", "loss_scale": 1.0, "consumed_samples": 1195776, "global_step/max_steps": "4671/6362"} +{"lm loss": 4.88908386, "grad_norm": 0.38787806, "learning_rate": 2.054e-05, "elapsed_time_per_iteration": 6.55292559, "memory(GiB)": 21.51, "elapsed_time": "8h 31m 49s", "remaining_time": "3h 5m 8s", "loss_scale": 1.0, "consumed_samples": 1196032, "global_step/max_steps": "4672/6362"} +{"lm loss": 4.88093042, "grad_norm": 0.35646424, "learning_rate": 2.052e-05, "elapsed_time_per_iteration": 6.38718128, "memory(GiB)": 21.51, "elapsed_time": "8h 31m 55s", "remaining_time": "3h 5m 1s", "loss_scale": 1.0, "consumed_samples": 1196288, "global_step/max_steps": "4673/6362"} +{"lm loss": 4.8813138, "grad_norm": 0.36548194, "learning_rate": 2.05e-05, "elapsed_time_per_iteration": 6.6750145, "memory(GiB)": 21.51, "elapsed_time": "8h 32m 2s", "remaining_time": "3h 4m 55s", "loss_scale": 1.0, "consumed_samples": 1196544, "global_step/max_steps": "4674/6362"} +{"lm loss": 4.87576818, "grad_norm": 0.35856956, "learning_rate": 2.048e-05, "elapsed_time_per_iteration": 6.51450348, "memory(GiB)": 21.51, "elapsed_time": "8h 32m 9s", "remaining_time": "3h 4m 48s", "loss_scale": 1.0, "consumed_samples": 1196800, "global_step/max_steps": "4675/6362"} +{"lm loss": 4.89032793, "grad_norm": 0.36491168, "learning_rate": 2.046e-05, "elapsed_time_per_iteration": 6.61111856, "memory(GiB)": 21.51, "elapsed_time": "8h 32m 15s", "remaining_time": "3h 4m 42s", "loss_scale": 1.0, "consumed_samples": 1197056, "global_step/max_steps": "4676/6362"} +{"lm loss": 4.89630032, "grad_norm": 0.40738398, "learning_rate": 2.044e-05, "elapsed_time_per_iteration": 6.55912209, "memory(GiB)": 21.51, "elapsed_time": "8h 32m 22s", "remaining_time": "3h 4m 35s", "loss_scale": 1.0, "consumed_samples": 1197312, "global_step/max_steps": "4677/6362"} +{"lm loss": 4.88386297, "grad_norm": 0.35915503, "learning_rate": 2.042e-05, "elapsed_time_per_iteration": 6.65191865, "memory(GiB)": 21.51, "elapsed_time": "8h 32m 28s", "remaining_time": "3h 4m 29s", "loss_scale": 1.0, "consumed_samples": 1197568, "global_step/max_steps": "4678/6362"} +{"lm loss": 4.88070869, "grad_norm": 0.37792206, "learning_rate": 2.04e-05, "elapsed_time_per_iteration": 6.70586133, "memory(GiB)": 21.51, "elapsed_time": "8h 32m 35s", "remaining_time": "3h 4m 22s", "loss_scale": 1.0, "consumed_samples": 1197824, "global_step/max_steps": "4679/6362"} +{"lm loss": 4.89029169, "grad_norm": 0.36187452, "learning_rate": 2.039e-05, "elapsed_time_per_iteration": 6.63748503, "memory(GiB)": 21.51, "elapsed_time": "8h 32m 42s", "remaining_time": "3h 4m 15s", "loss_scale": 1.0, "consumed_samples": 1198080, "global_step/max_steps": "4680/6362"} +{"lm loss": 4.89687109, "grad_norm": 0.37176245, "learning_rate": 2.037e-05, "elapsed_time_per_iteration": 6.5705359, "memory(GiB)": 21.51, "elapsed_time": "8h 32m 48s", "remaining_time": "3h 4m 9s", "loss_scale": 1.0, "consumed_samples": 1198336, "global_step/max_steps": "4681/6362"} +{"lm loss": 4.89261627, "grad_norm": 0.35311216, "learning_rate": 2.035e-05, "elapsed_time_per_iteration": 6.47520709, "memory(GiB)": 21.51, "elapsed_time": "8h 32m 55s", "remaining_time": "3h 4m 2s", "loss_scale": 1.0, "consumed_samples": 1198592, "global_step/max_steps": "4682/6362"} +{"lm loss": 4.86439466, "grad_norm": 0.381152, "learning_rate": 2.033e-05, "elapsed_time_per_iteration": 6.47212243, "memory(GiB)": 21.51, "elapsed_time": "8h 33m 1s", "remaining_time": "3h 3m 56s", "loss_scale": 1.0, "consumed_samples": 1198848, "global_step/max_steps": "4683/6362"} +{"lm loss": 4.90758038, "grad_norm": 0.35494906, "learning_rate": 2.031e-05, "elapsed_time_per_iteration": 6.50565886, "memory(GiB)": 21.51, "elapsed_time": "8h 33m 8s", "remaining_time": "3h 3m 49s", "loss_scale": 1.0, "consumed_samples": 1199104, "global_step/max_steps": "4684/6362"} +{"lm loss": 4.88997078, "grad_norm": 0.38931021, "learning_rate": 2.029e-05, "elapsed_time_per_iteration": 6.39246941, "memory(GiB)": 21.51, "elapsed_time": "8h 33m 14s", "remaining_time": "3h 3m 42s", "loss_scale": 1.0, "consumed_samples": 1199360, "global_step/max_steps": "4685/6362"} +{"lm loss": 4.8798399, "grad_norm": 0.33413717, "learning_rate": 2.027e-05, "elapsed_time_per_iteration": 6.43226814, "memory(GiB)": 21.51, "elapsed_time": "8h 33m 21s", "remaining_time": "3h 3m 36s", "loss_scale": 1.0, "consumed_samples": 1199616, "global_step/max_steps": "4686/6362"} +{"lm loss": 4.89299107, "grad_norm": 0.36820456, "learning_rate": 2.025e-05, "elapsed_time_per_iteration": 6.61353302, "memory(GiB)": 21.51, "elapsed_time": "8h 33m 27s", "remaining_time": "3h 3m 29s", "loss_scale": 1.0, "consumed_samples": 1199872, "global_step/max_steps": "4687/6362"} +{"lm loss": 4.89707232, "grad_norm": 0.35843936, "learning_rate": 2.023e-05, "elapsed_time_per_iteration": 6.59625912, "memory(GiB)": 21.51, "elapsed_time": "8h 33m 34s", "remaining_time": "3h 3m 23s", "loss_scale": 1.0, "consumed_samples": 1200128, "global_step/max_steps": "4688/6362"} +{"lm loss": 4.85846424, "grad_norm": 0.34451565, "learning_rate": 2.021e-05, "elapsed_time_per_iteration": 6.60367966, "memory(GiB)": 21.51, "elapsed_time": "8h 33m 40s", "remaining_time": "3h 3m 16s", "loss_scale": 1.0, "consumed_samples": 1200384, "global_step/max_steps": "4689/6362"} +{"lm loss": 4.88356304, "grad_norm": 0.36337775, "learning_rate": 2.019e-05, "elapsed_time_per_iteration": 6.60524511, "memory(GiB)": 21.51, "elapsed_time": "8h 33m 47s", "remaining_time": "3h 3m 10s", "loss_scale": 1.0, "consumed_samples": 1200640, "global_step/max_steps": "4690/6362"} +{"lm loss": 4.89048624, "grad_norm": 0.33711091, "learning_rate": 2.017e-05, "elapsed_time_per_iteration": 6.79319191, "memory(GiB)": 21.51, "elapsed_time": "8h 33m 54s", "remaining_time": "3h 3m 3s", "loss_scale": 1.0, "consumed_samples": 1200896, "global_step/max_steps": "4691/6362"} +{"lm loss": 4.89286232, "grad_norm": 0.37207198, "learning_rate": 2.015e-05, "elapsed_time_per_iteration": 6.57333207, "memory(GiB)": 21.51, "elapsed_time": "8h 34m 0s", "remaining_time": "3h 2m 57s", "loss_scale": 1.0, "consumed_samples": 1201152, "global_step/max_steps": "4692/6362"} +{"lm loss": 4.88433456, "grad_norm": 0.33585033, "learning_rate": 2.013e-05, "elapsed_time_per_iteration": 6.76051378, "memory(GiB)": 21.51, "elapsed_time": "8h 34m 7s", "remaining_time": "3h 2m 50s", "loss_scale": 1.0, "consumed_samples": 1201408, "global_step/max_steps": "4693/6362"} +{"lm loss": 4.88845682, "grad_norm": 0.36256987, "learning_rate": 2.012e-05, "elapsed_time_per_iteration": 6.83637214, "memory(GiB)": 21.51, "elapsed_time": "8h 34m 14s", "remaining_time": "3h 2m 44s", "loss_scale": 1.0, "consumed_samples": 1201664, "global_step/max_steps": "4694/6362"} +{"lm loss": 4.90112352, "grad_norm": 0.3477574, "learning_rate": 2.01e-05, "elapsed_time_per_iteration": 6.44376469, "memory(GiB)": 21.51, "elapsed_time": "8h 34m 20s", "remaining_time": "3h 2m 37s", "loss_scale": 1.0, "consumed_samples": 1201920, "global_step/max_steps": "4695/6362"} +{"lm loss": 4.90228462, "grad_norm": 0.37111413, "learning_rate": 2.008e-05, "elapsed_time_per_iteration": 6.50237465, "memory(GiB)": 21.51, "elapsed_time": "8h 34m 27s", "remaining_time": "3h 2m 30s", "loss_scale": 1.0, "consumed_samples": 1202176, "global_step/max_steps": "4696/6362"} +{"lm loss": 4.8766222, "grad_norm": 0.32948336, "learning_rate": 2.006e-05, "elapsed_time_per_iteration": 6.47245145, "memory(GiB)": 21.51, "elapsed_time": "8h 34m 33s", "remaining_time": "3h 2m 24s", "loss_scale": 1.0, "consumed_samples": 1202432, "global_step/max_steps": "4697/6362"} +{"lm loss": 4.89375019, "grad_norm": 0.36189112, "learning_rate": 2.004e-05, "elapsed_time_per_iteration": 6.58089709, "memory(GiB)": 21.51, "elapsed_time": "8h 34m 40s", "remaining_time": "3h 2m 17s", "loss_scale": 1.0, "consumed_samples": 1202688, "global_step/max_steps": "4698/6362"} +{"lm loss": 4.87432337, "grad_norm": 0.33672252, "learning_rate": 2.002e-05, "elapsed_time_per_iteration": 6.57843971, "memory(GiB)": 21.51, "elapsed_time": "8h 34m 47s", "remaining_time": "3h 2m 11s", "loss_scale": 1.0, "consumed_samples": 1202944, "global_step/max_steps": "4699/6362"} +{"lm loss": 4.89219093, "grad_norm": 0.36449552, "learning_rate": 2e-05, "elapsed_time_per_iteration": 6.6153667, "memory(GiB)": 21.51, "elapsed_time": "8h 34m 53s", "remaining_time": "3h 2m 4s", "loss_scale": 1.0, "consumed_samples": 1203200, "global_step/max_steps": "4700/6362"} +{"lm loss": 4.90437603, "grad_norm": 0.35759372, "learning_rate": 1.998e-05, "elapsed_time_per_iteration": 6.45748973, "memory(GiB)": 21.51, "elapsed_time": "8h 35m 0s", "remaining_time": "3h 1m 57s", "loss_scale": 1.0, "consumed_samples": 1203456, "global_step/max_steps": "4701/6362"} +{"lm loss": 4.88059187, "grad_norm": 0.32683131, "learning_rate": 1.996e-05, "elapsed_time_per_iteration": 6.37763166, "memory(GiB)": 21.51, "elapsed_time": "8h 35m 6s", "remaining_time": "3h 1m 51s", "loss_scale": 1.0, "consumed_samples": 1203712, "global_step/max_steps": "4702/6362"} +{"lm loss": 4.88438892, "grad_norm": 0.3815082, "learning_rate": 1.994e-05, "elapsed_time_per_iteration": 6.35133815, "memory(GiB)": 21.51, "elapsed_time": "8h 35m 12s", "remaining_time": "3h 1m 44s", "loss_scale": 1.0, "consumed_samples": 1203968, "global_step/max_steps": "4703/6362"} +{"lm loss": 4.89730835, "grad_norm": 0.35843399, "learning_rate": 1.992e-05, "elapsed_time_per_iteration": 6.7367239, "memory(GiB)": 21.51, "elapsed_time": "8h 35m 19s", "remaining_time": "3h 1m 38s", "loss_scale": 1.0, "consumed_samples": 1204224, "global_step/max_steps": "4704/6362"} +{"lm loss": 4.862679, "grad_norm": 0.38448602, "learning_rate": 1.99e-05, "elapsed_time_per_iteration": 6.55264878, "memory(GiB)": 21.51, "elapsed_time": "8h 35m 26s", "remaining_time": "3h 1m 31s", "loss_scale": 1.0, "consumed_samples": 1204480, "global_step/max_steps": "4705/6362"} +{"lm loss": 4.90133095, "grad_norm": 0.35439286, "learning_rate": 1.989e-05, "elapsed_time_per_iteration": 6.89356112, "memory(GiB)": 21.51, "elapsed_time": "8h 35m 33s", "remaining_time": "3h 1m 25s", "loss_scale": 1.0, "consumed_samples": 1204736, "global_step/max_steps": "4706/6362"} +{"lm loss": 4.89297867, "grad_norm": 0.34417275, "learning_rate": 1.987e-05, "elapsed_time_per_iteration": 6.22214055, "memory(GiB)": 21.51, "elapsed_time": "8h 35m 39s", "remaining_time": "3h 1m 18s", "loss_scale": 1.0, "consumed_samples": 1204992, "global_step/max_steps": "4707/6362"} +{"lm loss": 4.87593174, "grad_norm": 0.3882834, "learning_rate": 1.985e-05, "elapsed_time_per_iteration": 6.70439076, "memory(GiB)": 21.51, "elapsed_time": "8h 35m 45s", "remaining_time": "3h 1m 11s", "loss_scale": 1.0, "consumed_samples": 1205248, "global_step/max_steps": "4708/6362"} +{"lm loss": 4.89500093, "grad_norm": 0.32086024, "learning_rate": 1.983e-05, "elapsed_time_per_iteration": 6.43548465, "memory(GiB)": 21.51, "elapsed_time": "8h 35m 52s", "remaining_time": "3h 1m 5s", "loss_scale": 1.0, "consumed_samples": 1205504, "global_step/max_steps": "4709/6362"} +{"lm loss": 4.89813709, "grad_norm": 0.37592423, "learning_rate": 1.981e-05, "elapsed_time_per_iteration": 6.45109153, "memory(GiB)": 21.51, "elapsed_time": "8h 35m 58s", "remaining_time": "3h 0m 58s", "loss_scale": 1.0, "consumed_samples": 1205760, "global_step/max_steps": "4710/6362"} +{"lm loss": 4.85903835, "grad_norm": 0.3255578, "learning_rate": 1.979e-05, "elapsed_time_per_iteration": 6.50081897, "memory(GiB)": 21.51, "elapsed_time": "8h 36m 5s", "remaining_time": "3h 0m 51s", "loss_scale": 1.0, "consumed_samples": 1206016, "global_step/max_steps": "4711/6362"} +{"lm loss": 4.88171768, "grad_norm": 0.36728299, "learning_rate": 1.977e-05, "elapsed_time_per_iteration": 6.53736806, "memory(GiB)": 21.51, "elapsed_time": "8h 36m 11s", "remaining_time": "3h 0m 45s", "loss_scale": 1.0, "consumed_samples": 1206272, "global_step/max_steps": "4712/6362"} +{"lm loss": 4.86997604, "grad_norm": 0.32526147, "learning_rate": 1.975e-05, "elapsed_time_per_iteration": 6.47966719, "memory(GiB)": 21.51, "elapsed_time": "8h 36m 18s", "remaining_time": "3h 0m 38s", "loss_scale": 1.0, "consumed_samples": 1206528, "global_step/max_steps": "4713/6362"} +{"lm loss": 4.87338638, "grad_norm": 0.37561935, "learning_rate": 1.973e-05, "elapsed_time_per_iteration": 6.51076531, "memory(GiB)": 21.51, "elapsed_time": "8h 36m 24s", "remaining_time": "3h 0m 32s", "loss_scale": 1.0, "consumed_samples": 1206784, "global_step/max_steps": "4714/6362"} +{"lm loss": 4.88436031, "grad_norm": 0.34139678, "learning_rate": 1.971e-05, "elapsed_time_per_iteration": 6.59334826, "memory(GiB)": 21.51, "elapsed_time": "8h 36m 31s", "remaining_time": "3h 0m 25s", "loss_scale": 1.0, "consumed_samples": 1207040, "global_step/max_steps": "4715/6362"} +{"lm loss": 4.86386681, "grad_norm": 0.36761305, "learning_rate": 1.969e-05, "elapsed_time_per_iteration": 6.68307185, "memory(GiB)": 21.51, "elapsed_time": "8h 36m 38s", "remaining_time": "3h 0m 19s", "loss_scale": 1.0, "consumed_samples": 1207296, "global_step/max_steps": "4716/6362"} +{"lm loss": 4.85287142, "grad_norm": 0.38186848, "learning_rate": 1.968e-05, "elapsed_time_per_iteration": 6.60519052, "memory(GiB)": 21.51, "elapsed_time": "8h 36m 44s", "remaining_time": "3h 0m 12s", "loss_scale": 1.0, "consumed_samples": 1207552, "global_step/max_steps": "4717/6362"} +{"lm loss": 4.90774679, "grad_norm": 0.36846754, "learning_rate": 1.966e-05, "elapsed_time_per_iteration": 6.58129191, "memory(GiB)": 21.51, "elapsed_time": "8h 36m 51s", "remaining_time": "3h 0m 5s", "loss_scale": 1.0, "consumed_samples": 1207808, "global_step/max_steps": "4718/6362"} +{"lm loss": 4.87260866, "grad_norm": 0.44058758, "learning_rate": 1.964e-05, "elapsed_time_per_iteration": 6.60963726, "memory(GiB)": 21.51, "elapsed_time": "8h 36m 57s", "remaining_time": "2h 59m 59s", "loss_scale": 1.0, "consumed_samples": 1208064, "global_step/max_steps": "4719/6362"} +{"lm loss": 4.84344816, "grad_norm": 0.38297477, "learning_rate": 1.962e-05, "elapsed_time_per_iteration": 6.35397673, "memory(GiB)": 21.51, "elapsed_time": "8h 37m 4s", "remaining_time": "2h 59m 52s", "loss_scale": 1.0, "consumed_samples": 1208320, "global_step/max_steps": "4720/6362"} +{"lm loss": 4.88531876, "grad_norm": 0.37810081, "learning_rate": 1.96e-05, "elapsed_time_per_iteration": 6.66879725, "memory(GiB)": 21.51, "elapsed_time": "8h 37m 10s", "remaining_time": "2h 59m 46s", "loss_scale": 1.0, "consumed_samples": 1208576, "global_step/max_steps": "4721/6362"} +{"lm loss": 4.87458801, "grad_norm": 0.43557364, "learning_rate": 1.958e-05, "elapsed_time_per_iteration": 6.6050806, "memory(GiB)": 21.51, "elapsed_time": "8h 37m 17s", "remaining_time": "2h 59m 39s", "loss_scale": 1.0, "consumed_samples": 1208832, "global_step/max_steps": "4722/6362"} +{"lm loss": 4.8785677, "grad_norm": 0.3469595, "learning_rate": 1.956e-05, "elapsed_time_per_iteration": 6.96142483, "memory(GiB)": 21.51, "elapsed_time": "8h 37m 24s", "remaining_time": "2h 59m 33s", "loss_scale": 1.0, "consumed_samples": 1209088, "global_step/max_steps": "4723/6362"} +{"lm loss": 4.8600359, "grad_norm": 0.38483337, "learning_rate": 1.954e-05, "elapsed_time_per_iteration": 6.63627434, "memory(GiB)": 21.51, "elapsed_time": "8h 37m 31s", "remaining_time": "2h 59m 26s", "loss_scale": 1.0, "consumed_samples": 1209344, "global_step/max_steps": "4724/6362"} +{"lm loss": 4.88864565, "grad_norm": 0.36872959, "learning_rate": 1.952e-05, "elapsed_time_per_iteration": 6.78609896, "memory(GiB)": 21.51, "elapsed_time": "8h 37m 37s", "remaining_time": "2h 59m 20s", "loss_scale": 1.0, "consumed_samples": 1209600, "global_step/max_steps": "4725/6362"} +{"lm loss": 4.88220692, "grad_norm": 0.35962182, "learning_rate": 1.95e-05, "elapsed_time_per_iteration": 7.10464978, "memory(GiB)": 21.51, "elapsed_time": "8h 37m 45s", "remaining_time": "2h 59m 13s", "loss_scale": 1.0, "consumed_samples": 1209856, "global_step/max_steps": "4726/6362"} +{"lm loss": 4.87244749, "grad_norm": 0.36292905, "learning_rate": 1.949e-05, "elapsed_time_per_iteration": 6.62901688, "memory(GiB)": 21.51, "elapsed_time": "8h 37m 51s", "remaining_time": "2h 59m 7s", "loss_scale": 1.0, "consumed_samples": 1210112, "global_step/max_steps": "4727/6362"} +{"lm loss": 4.87491751, "grad_norm": 0.33631423, "learning_rate": 1.947e-05, "elapsed_time_per_iteration": 6.73178792, "memory(GiB)": 21.51, "elapsed_time": "8h 37m 58s", "remaining_time": "2h 59m 0s", "loss_scale": 1.0, "consumed_samples": 1210368, "global_step/max_steps": "4728/6362"} +{"lm loss": 4.86433411, "grad_norm": 0.33605137, "learning_rate": 1.945e-05, "elapsed_time_per_iteration": 6.55030346, "memory(GiB)": 21.51, "elapsed_time": "8h 38m 4s", "remaining_time": "2h 58m 54s", "loss_scale": 1.0, "consumed_samples": 1210624, "global_step/max_steps": "4729/6362"} +{"lm loss": 4.91547871, "grad_norm": 0.33371025, "learning_rate": 1.943e-05, "elapsed_time_per_iteration": 6.53990674, "memory(GiB)": 21.51, "elapsed_time": "8h 38m 11s", "remaining_time": "2h 58m 47s", "loss_scale": 1.0, "consumed_samples": 1210880, "global_step/max_steps": "4730/6362"} +{"lm loss": 4.88108969, "grad_norm": 0.36007836, "learning_rate": 1.941e-05, "elapsed_time_per_iteration": 6.77538085, "memory(GiB)": 21.51, "elapsed_time": "8h 38m 18s", "remaining_time": "2h 58m 41s", "loss_scale": 1.0, "consumed_samples": 1211136, "global_step/max_steps": "4731/6362"} +{"lm loss": 4.89645243, "grad_norm": 0.3363972, "learning_rate": 1.939e-05, "elapsed_time_per_iteration": 6.80834079, "memory(GiB)": 21.51, "elapsed_time": "8h 38m 25s", "remaining_time": "2h 58m 34s", "loss_scale": 1.0, "consumed_samples": 1211392, "global_step/max_steps": "4732/6362"} +{"lm loss": 4.88078499, "grad_norm": 0.3600598, "learning_rate": 1.937e-05, "elapsed_time_per_iteration": 6.68172669, "memory(GiB)": 21.51, "elapsed_time": "8h 38m 31s", "remaining_time": "2h 58m 28s", "loss_scale": 1.0, "consumed_samples": 1211648, "global_step/max_steps": "4733/6362"} +{"lm loss": 4.85871744, "grad_norm": 0.3494198, "learning_rate": 1.935e-05, "elapsed_time_per_iteration": 6.49259949, "memory(GiB)": 21.51, "elapsed_time": "8h 38m 38s", "remaining_time": "2h 58m 21s", "loss_scale": 1.0, "consumed_samples": 1211904, "global_step/max_steps": "4734/6362"} +{"lm loss": 4.86481905, "grad_norm": 0.36703089, "learning_rate": 1.933e-05, "elapsed_time_per_iteration": 6.74549699, "memory(GiB)": 21.51, "elapsed_time": "8h 38m 44s", "remaining_time": "2h 58m 14s", "loss_scale": 1.0, "consumed_samples": 1212160, "global_step/max_steps": "4735/6362"} +{"lm loss": 4.89449644, "grad_norm": 0.35457987, "learning_rate": 1.932e-05, "elapsed_time_per_iteration": 6.62534595, "memory(GiB)": 21.51, "elapsed_time": "8h 38m 51s", "remaining_time": "2h 58m 8s", "loss_scale": 1.0, "consumed_samples": 1212416, "global_step/max_steps": "4736/6362"} +{"lm loss": 4.86473751, "grad_norm": 0.34610546, "learning_rate": 1.93e-05, "elapsed_time_per_iteration": 6.56025934, "memory(GiB)": 21.51, "elapsed_time": "8h 38m 58s", "remaining_time": "2h 58m 1s", "loss_scale": 1.0, "consumed_samples": 1212672, "global_step/max_steps": "4737/6362"} +{"lm loss": 4.8705225, "grad_norm": 0.372141, "learning_rate": 1.928e-05, "elapsed_time_per_iteration": 6.72503042, "memory(GiB)": 21.51, "elapsed_time": "8h 39m 4s", "remaining_time": "2h 57m 55s", "loss_scale": 1.0, "consumed_samples": 1212928, "global_step/max_steps": "4738/6362"} +{"lm loss": 4.87768888, "grad_norm": 0.3673715, "learning_rate": 1.926e-05, "elapsed_time_per_iteration": 6.7975142, "memory(GiB)": 21.51, "elapsed_time": "8h 39m 11s", "remaining_time": "2h 57m 48s", "loss_scale": 1.0, "consumed_samples": 1213184, "global_step/max_steps": "4739/6362"} +{"lm loss": 4.87359715, "grad_norm": 0.33355191, "learning_rate": 1.924e-05, "elapsed_time_per_iteration": 6.65118647, "memory(GiB)": 21.51, "elapsed_time": "8h 39m 18s", "remaining_time": "2h 57m 42s", "loss_scale": 1.0, "consumed_samples": 1213440, "global_step/max_steps": "4740/6362"} +{"lm loss": 4.85801697, "grad_norm": 0.3680492, "learning_rate": 1.922e-05, "elapsed_time_per_iteration": 6.53247309, "memory(GiB)": 21.51, "elapsed_time": "8h 39m 24s", "remaining_time": "2h 57m 35s", "loss_scale": 1.0, "consumed_samples": 1213696, "global_step/max_steps": "4741/6362"} +{"lm loss": 4.89570045, "grad_norm": 0.33721849, "learning_rate": 1.92e-05, "elapsed_time_per_iteration": 6.48785019, "memory(GiB)": 21.51, "elapsed_time": "8h 39m 31s", "remaining_time": "2h 57m 29s", "loss_scale": 1.0, "consumed_samples": 1213952, "global_step/max_steps": "4742/6362"} +{"lm loss": 4.87152958, "grad_norm": 0.34881842, "learning_rate": 1.918e-05, "elapsed_time_per_iteration": 6.44546199, "memory(GiB)": 21.51, "elapsed_time": "8h 39m 37s", "remaining_time": "2h 57m 22s", "loss_scale": 1.0, "consumed_samples": 1214208, "global_step/max_steps": "4743/6362"} +{"lm loss": 4.88215971, "grad_norm": 0.33465514, "learning_rate": 1.917e-05, "elapsed_time_per_iteration": 6.76564741, "memory(GiB)": 21.51, "elapsed_time": "8h 39m 44s", "remaining_time": "2h 57m 15s", "loss_scale": 1.0, "consumed_samples": 1214464, "global_step/max_steps": "4744/6362"} +{"lm loss": 4.90235376, "grad_norm": 0.33829206, "learning_rate": 1.915e-05, "elapsed_time_per_iteration": 6.52006364, "memory(GiB)": 21.51, "elapsed_time": "8h 39m 51s", "remaining_time": "2h 57m 9s", "loss_scale": 1.0, "consumed_samples": 1214720, "global_step/max_steps": "4745/6362"} +{"lm loss": 4.90562963, "grad_norm": 0.33106598, "learning_rate": 1.913e-05, "elapsed_time_per_iteration": 6.31082201, "memory(GiB)": 21.51, "elapsed_time": "8h 39m 57s", "remaining_time": "2h 57m 2s", "loss_scale": 1.0, "consumed_samples": 1214976, "global_step/max_steps": "4746/6362"} +{"lm loss": 4.89153385, "grad_norm": 0.32212862, "learning_rate": 1.911e-05, "elapsed_time_per_iteration": 6.52913451, "memory(GiB)": 21.51, "elapsed_time": "8h 40m 3s", "remaining_time": "2h 56m 56s", "loss_scale": 1.0, "consumed_samples": 1215232, "global_step/max_steps": "4747/6362"} +{"lm loss": 4.88919306, "grad_norm": 0.33204752, "learning_rate": 1.909e-05, "elapsed_time_per_iteration": 6.60994983, "memory(GiB)": 21.51, "elapsed_time": "8h 40m 10s", "remaining_time": "2h 56m 49s", "loss_scale": 1.0, "consumed_samples": 1215488, "global_step/max_steps": "4748/6362"} +{"lm loss": 4.87788582, "grad_norm": 0.34647265, "learning_rate": 1.907e-05, "elapsed_time_per_iteration": 6.66058445, "memory(GiB)": 21.51, "elapsed_time": "8h 40m 17s", "remaining_time": "2h 56m 42s", "loss_scale": 1.0, "consumed_samples": 1215744, "global_step/max_steps": "4749/6362"} +{"lm loss": 4.88313961, "grad_norm": 0.31873012, "learning_rate": 1.905e-05, "elapsed_time_per_iteration": 6.69499111, "memory(GiB)": 21.51, "elapsed_time": "8h 40m 23s", "remaining_time": "2h 56m 36s", "loss_scale": 1.0, "consumed_samples": 1216000, "global_step/max_steps": "4750/6362"} +{"lm loss": 4.89549065, "grad_norm": 0.33982107, "learning_rate": 1.903e-05, "elapsed_time_per_iteration": 6.31306672, "memory(GiB)": 21.51, "elapsed_time": "8h 40m 30s", "remaining_time": "2h 56m 29s", "loss_scale": 1.0, "consumed_samples": 1216256, "global_step/max_steps": "4751/6362"} +{"lm loss": 4.88079739, "grad_norm": 0.36139339, "learning_rate": 1.902e-05, "elapsed_time_per_iteration": 6.60354471, "memory(GiB)": 21.51, "elapsed_time": "8h 40m 36s", "remaining_time": "2h 56m 23s", "loss_scale": 1.0, "consumed_samples": 1216512, "global_step/max_steps": "4752/6362"} +{"lm loss": 4.90285635, "grad_norm": 0.33498353, "learning_rate": 1.9e-05, "elapsed_time_per_iteration": 6.53091121, "memory(GiB)": 21.51, "elapsed_time": "8h 40m 43s", "remaining_time": "2h 56m 16s", "loss_scale": 1.0, "consumed_samples": 1216768, "global_step/max_steps": "4753/6362"} +{"lm loss": 4.88754034, "grad_norm": 0.36402282, "learning_rate": 1.898e-05, "elapsed_time_per_iteration": 6.59348941, "memory(GiB)": 21.51, "elapsed_time": "8h 40m 49s", "remaining_time": "2h 56m 10s", "loss_scale": 1.0, "consumed_samples": 1217024, "global_step/max_steps": "4754/6362"} +{"lm loss": 4.86172581, "grad_norm": 0.34859616, "learning_rate": 1.896e-05, "elapsed_time_per_iteration": 6.64482117, "memory(GiB)": 21.51, "elapsed_time": "8h 40m 56s", "remaining_time": "2h 56m 3s", "loss_scale": 1.0, "consumed_samples": 1217280, "global_step/max_steps": "4755/6362"} +{"lm loss": 4.88013363, "grad_norm": 0.37910005, "learning_rate": 1.894e-05, "elapsed_time_per_iteration": 6.480654, "memory(GiB)": 21.51, "elapsed_time": "8h 41m 3s", "remaining_time": "2h 55m 56s", "loss_scale": 1.0, "consumed_samples": 1217536, "global_step/max_steps": "4756/6362"} +{"lm loss": 4.85539627, "grad_norm": 0.32936397, "learning_rate": 1.892e-05, "elapsed_time_per_iteration": 6.71861219, "memory(GiB)": 21.51, "elapsed_time": "8h 41m 9s", "remaining_time": "2h 55m 50s", "loss_scale": 1.0, "consumed_samples": 1217792, "global_step/max_steps": "4757/6362"} +{"lm loss": 4.88883066, "grad_norm": 0.37946659, "learning_rate": 1.89e-05, "elapsed_time_per_iteration": 6.63008022, "memory(GiB)": 21.51, "elapsed_time": "8h 41m 16s", "remaining_time": "2h 55m 43s", "loss_scale": 1.0, "consumed_samples": 1218048, "global_step/max_steps": "4758/6362"} +{"lm loss": 4.89963388, "grad_norm": 0.3902629, "learning_rate": 1.888e-05, "elapsed_time_per_iteration": 6.59818745, "memory(GiB)": 21.51, "elapsed_time": "8h 41m 23s", "remaining_time": "2h 55m 37s", "loss_scale": 1.0, "consumed_samples": 1218304, "global_step/max_steps": "4759/6362"} +{"lm loss": 4.89285517, "grad_norm": 0.31044936, "learning_rate": 1.887e-05, "elapsed_time_per_iteration": 6.65221143, "memory(GiB)": 21.51, "elapsed_time": "8h 41m 29s", "remaining_time": "2h 55m 30s", "loss_scale": 1.0, "consumed_samples": 1218560, "global_step/max_steps": "4760/6362"} +{"lm loss": 4.89436579, "grad_norm": 0.39318421, "learning_rate": 1.885e-05, "elapsed_time_per_iteration": 6.44100952, "memory(GiB)": 21.51, "elapsed_time": "8h 41m 36s", "remaining_time": "2h 55m 24s", "loss_scale": 1.0, "consumed_samples": 1218816, "global_step/max_steps": "4761/6362"} +{"lm loss": 4.89266491, "grad_norm": 0.35007033, "learning_rate": 1.883e-05, "elapsed_time_per_iteration": 6.30759811, "memory(GiB)": 21.51, "elapsed_time": "8h 41m 42s", "remaining_time": "2h 55m 17s", "loss_scale": 1.0, "consumed_samples": 1219072, "global_step/max_steps": "4762/6362"} +{"lm loss": 4.86992311, "grad_norm": 0.35766518, "learning_rate": 1.881e-05, "elapsed_time_per_iteration": 6.77197361, "memory(GiB)": 21.51, "elapsed_time": "8h 41m 49s", "remaining_time": "2h 55m 10s", "loss_scale": 1.0, "consumed_samples": 1219328, "global_step/max_steps": "4763/6362"} +{"lm loss": 4.87306166, "grad_norm": 0.36378527, "learning_rate": 1.879e-05, "elapsed_time_per_iteration": 6.54399252, "memory(GiB)": 21.51, "elapsed_time": "8h 41m 55s", "remaining_time": "2h 55m 4s", "loss_scale": 1.0, "consumed_samples": 1219584, "global_step/max_steps": "4764/6362"} +{"lm loss": 4.89857578, "grad_norm": 0.34435287, "learning_rate": 1.877e-05, "elapsed_time_per_iteration": 6.45540714, "memory(GiB)": 21.51, "elapsed_time": "8h 42m 2s", "remaining_time": "2h 54m 57s", "loss_scale": 1.0, "consumed_samples": 1219840, "global_step/max_steps": "4765/6362"} +{"lm loss": 4.87320423, "grad_norm": 0.35606351, "learning_rate": 1.875e-05, "elapsed_time_per_iteration": 6.47861671, "memory(GiB)": 21.51, "elapsed_time": "8h 42m 8s", "remaining_time": "2h 54m 51s", "loss_scale": 1.0, "consumed_samples": 1220096, "global_step/max_steps": "4766/6362"} +{"lm loss": 4.86223316, "grad_norm": 0.37123692, "learning_rate": 1.874e-05, "elapsed_time_per_iteration": 6.46934462, "memory(GiB)": 21.51, "elapsed_time": "8h 42m 15s", "remaining_time": "2h 54m 44s", "loss_scale": 1.0, "consumed_samples": 1220352, "global_step/max_steps": "4767/6362"} +{"lm loss": 4.88566065, "grad_norm": 0.36076531, "learning_rate": 1.872e-05, "elapsed_time_per_iteration": 6.79577518, "memory(GiB)": 21.51, "elapsed_time": "8h 42m 21s", "remaining_time": "2h 54m 37s", "loss_scale": 1.0, "consumed_samples": 1220608, "global_step/max_steps": "4768/6362"} +{"lm loss": 4.90139866, "grad_norm": 0.33030954, "learning_rate": 1.87e-05, "elapsed_time_per_iteration": 6.65987158, "memory(GiB)": 21.51, "elapsed_time": "8h 42m 28s", "remaining_time": "2h 54m 31s", "loss_scale": 1.0, "consumed_samples": 1220864, "global_step/max_steps": "4769/6362"} +{"lm loss": 4.88788748, "grad_norm": 0.33590874, "learning_rate": 1.868e-05, "elapsed_time_per_iteration": 6.57363057, "memory(GiB)": 21.51, "elapsed_time": "8h 42m 35s", "remaining_time": "2h 54m 24s", "loss_scale": 1.0, "consumed_samples": 1221120, "global_step/max_steps": "4770/6362"} +{"lm loss": 4.88862705, "grad_norm": 0.35435337, "learning_rate": 1.866e-05, "elapsed_time_per_iteration": 6.57431054, "memory(GiB)": 21.51, "elapsed_time": "8h 42m 41s", "remaining_time": "2h 54m 18s", "loss_scale": 1.0, "consumed_samples": 1221376, "global_step/max_steps": "4771/6362"} +{"lm loss": 4.88871765, "grad_norm": 0.34017795, "learning_rate": 1.864e-05, "elapsed_time_per_iteration": 6.53706002, "memory(GiB)": 21.51, "elapsed_time": "8h 42m 48s", "remaining_time": "2h 54m 11s", "loss_scale": 1.0, "consumed_samples": 1221632, "global_step/max_steps": "4772/6362"} +{"lm loss": 4.88968086, "grad_norm": 0.34589484, "learning_rate": 1.862e-05, "elapsed_time_per_iteration": 6.31186628, "memory(GiB)": 21.51, "elapsed_time": "8h 42m 54s", "remaining_time": "2h 54m 5s", "loss_scale": 1.0, "consumed_samples": 1221888, "global_step/max_steps": "4773/6362"} +{"lm loss": 4.87853098, "grad_norm": 0.35679567, "learning_rate": 1.861e-05, "elapsed_time_per_iteration": 6.53732443, "memory(GiB)": 21.51, "elapsed_time": "8h 43m 1s", "remaining_time": "2h 53m 58s", "loss_scale": 1.0, "consumed_samples": 1222144, "global_step/max_steps": "4774/6362"} +{"lm loss": 4.87110806, "grad_norm": 0.32826051, "learning_rate": 1.859e-05, "elapsed_time_per_iteration": 6.70978022, "memory(GiB)": 21.51, "elapsed_time": "8h 43m 7s", "remaining_time": "2h 53m 51s", "loss_scale": 1.0, "consumed_samples": 1222400, "global_step/max_steps": "4775/6362"} +{"lm loss": 4.86691427, "grad_norm": 0.34787402, "learning_rate": 1.857e-05, "elapsed_time_per_iteration": 6.62635469, "memory(GiB)": 21.51, "elapsed_time": "8h 43m 14s", "remaining_time": "2h 53m 45s", "loss_scale": 1.0, "consumed_samples": 1222656, "global_step/max_steps": "4776/6362"} +{"lm loss": 4.88249302, "grad_norm": 0.35627452, "learning_rate": 1.855e-05, "elapsed_time_per_iteration": 6.53431487, "memory(GiB)": 21.51, "elapsed_time": "8h 43m 21s", "remaining_time": "2h 53m 38s", "loss_scale": 1.0, "consumed_samples": 1222912, "global_step/max_steps": "4777/6362"} +{"lm loss": 4.87258053, "grad_norm": 0.3647525, "learning_rate": 1.853e-05, "elapsed_time_per_iteration": 6.62312269, "memory(GiB)": 21.51, "elapsed_time": "8h 43m 27s", "remaining_time": "2h 53m 32s", "loss_scale": 1.0, "consumed_samples": 1223168, "global_step/max_steps": "4778/6362"} +{"lm loss": 4.87656116, "grad_norm": 0.36264491, "learning_rate": 1.851e-05, "elapsed_time_per_iteration": 6.60101533, "memory(GiB)": 21.51, "elapsed_time": "8h 43m 34s", "remaining_time": "2h 53m 25s", "loss_scale": 1.0, "consumed_samples": 1223424, "global_step/max_steps": "4779/6362"} +{"lm loss": 4.90607738, "grad_norm": 0.43700954, "learning_rate": 1.849e-05, "elapsed_time_per_iteration": 6.57239437, "memory(GiB)": 21.51, "elapsed_time": "8h 43m 40s", "remaining_time": "2h 53m 19s", "loss_scale": 1.0, "consumed_samples": 1223680, "global_step/max_steps": "4780/6362"} +{"lm loss": 4.88128948, "grad_norm": 0.35968128, "learning_rate": 1.848e-05, "elapsed_time_per_iteration": 6.38787532, "memory(GiB)": 21.51, "elapsed_time": "8h 43m 47s", "remaining_time": "2h 53m 12s", "loss_scale": 1.0, "consumed_samples": 1223936, "global_step/max_steps": "4781/6362"} +{"lm loss": 4.89794064, "grad_norm": 0.39126596, "learning_rate": 1.846e-05, "elapsed_time_per_iteration": 6.41423535, "memory(GiB)": 21.51, "elapsed_time": "8h 43m 53s", "remaining_time": "2h 53m 5s", "loss_scale": 1.0, "consumed_samples": 1224192, "global_step/max_steps": "4782/6362"} +{"lm loss": 4.90799427, "grad_norm": 0.3633039, "learning_rate": 1.844e-05, "elapsed_time_per_iteration": 6.5057559, "memory(GiB)": 21.51, "elapsed_time": "8h 44m 0s", "remaining_time": "2h 52m 59s", "loss_scale": 1.0, "consumed_samples": 1224448, "global_step/max_steps": "4783/6362"} +{"lm loss": 4.90081501, "grad_norm": 0.35501266, "learning_rate": 1.842e-05, "elapsed_time_per_iteration": 6.48818183, "memory(GiB)": 21.51, "elapsed_time": "8h 44m 6s", "remaining_time": "2h 52m 52s", "loss_scale": 1.0, "consumed_samples": 1224704, "global_step/max_steps": "4784/6362"} +{"lm loss": 4.87520266, "grad_norm": 0.3766858, "learning_rate": 1.84e-05, "elapsed_time_per_iteration": 6.31383419, "memory(GiB)": 21.51, "elapsed_time": "8h 44m 12s", "remaining_time": "2h 52m 45s", "loss_scale": 1.0, "consumed_samples": 1224960, "global_step/max_steps": "4785/6362"} +{"lm loss": 4.86929369, "grad_norm": 0.35779595, "learning_rate": 1.838e-05, "elapsed_time_per_iteration": 6.57549644, "memory(GiB)": 21.51, "elapsed_time": "8h 44m 19s", "remaining_time": "2h 52m 39s", "loss_scale": 1.0, "consumed_samples": 1225216, "global_step/max_steps": "4786/6362"} +{"lm loss": 4.87435436, "grad_norm": 0.35834965, "learning_rate": 1.837e-05, "elapsed_time_per_iteration": 6.81894374, "memory(GiB)": 21.51, "elapsed_time": "8h 44m 26s", "remaining_time": "2h 52m 32s", "loss_scale": 1.0, "consumed_samples": 1225472, "global_step/max_steps": "4787/6362"} +{"lm loss": 4.8770175, "grad_norm": 0.34765545, "learning_rate": 1.835e-05, "elapsed_time_per_iteration": 6.60834146, "memory(GiB)": 21.51, "elapsed_time": "8h 44m 32s", "remaining_time": "2h 52m 26s", "loss_scale": 1.0, "consumed_samples": 1225728, "global_step/max_steps": "4788/6362"} +{"lm loss": 4.87746334, "grad_norm": 0.33847636, "learning_rate": 1.833e-05, "elapsed_time_per_iteration": 6.48332715, "memory(GiB)": 21.51, "elapsed_time": "8h 44m 39s", "remaining_time": "2h 52m 19s", "loss_scale": 1.0, "consumed_samples": 1225984, "global_step/max_steps": "4789/6362"} +{"lm loss": 4.90465069, "grad_norm": 0.34627622, "learning_rate": 1.831e-05, "elapsed_time_per_iteration": 6.61864042, "memory(GiB)": 21.51, "elapsed_time": "8h 44m 46s", "remaining_time": "2h 52m 13s", "loss_scale": 1.0, "consumed_samples": 1226240, "global_step/max_steps": "4790/6362"} +{"lm loss": 4.86649704, "grad_norm": 0.34515291, "learning_rate": 1.829e-05, "elapsed_time_per_iteration": 6.416924, "memory(GiB)": 21.51, "elapsed_time": "8h 44m 52s", "remaining_time": "2h 52m 6s", "loss_scale": 1.0, "consumed_samples": 1226496, "global_step/max_steps": "4791/6362"} +{"lm loss": 4.89967632, "grad_norm": 0.36874285, "learning_rate": 1.827e-05, "elapsed_time_per_iteration": 6.60104012, "memory(GiB)": 21.51, "elapsed_time": "8h 44m 59s", "remaining_time": "2h 52m 0s", "loss_scale": 1.0, "consumed_samples": 1226752, "global_step/max_steps": "4792/6362"} +{"lm loss": 4.89862823, "grad_norm": 0.34458935, "learning_rate": 1.826e-05, "elapsed_time_per_iteration": 6.54681492, "memory(GiB)": 21.51, "elapsed_time": "8h 45m 5s", "remaining_time": "2h 51m 53s", "loss_scale": 1.0, "consumed_samples": 1227008, "global_step/max_steps": "4793/6362"} +{"lm loss": 4.87044382, "grad_norm": 0.38954055, "learning_rate": 1.824e-05, "elapsed_time_per_iteration": 6.51624107, "memory(GiB)": 21.51, "elapsed_time": "8h 45m 12s", "remaining_time": "2h 51m 46s", "loss_scale": 1.0, "consumed_samples": 1227264, "global_step/max_steps": "4794/6362"} +{"lm loss": 4.88728762, "grad_norm": 0.36735138, "learning_rate": 1.822e-05, "elapsed_time_per_iteration": 6.33565378, "memory(GiB)": 21.51, "elapsed_time": "8h 45m 18s", "remaining_time": "2h 51m 40s", "loss_scale": 1.0, "consumed_samples": 1227520, "global_step/max_steps": "4795/6362"} +{"lm loss": 4.88554287, "grad_norm": 0.37076613, "learning_rate": 1.82e-05, "elapsed_time_per_iteration": 6.729321, "memory(GiB)": 21.51, "elapsed_time": "8h 45m 25s", "remaining_time": "2h 51m 33s", "loss_scale": 1.0, "consumed_samples": 1227776, "global_step/max_steps": "4796/6362"} +{"lm loss": 4.8682909, "grad_norm": 0.3903378, "learning_rate": 1.818e-05, "elapsed_time_per_iteration": 6.77699018, "memory(GiB)": 21.51, "elapsed_time": "8h 45m 31s", "remaining_time": "2h 51m 27s", "loss_scale": 1.0, "consumed_samples": 1228032, "global_step/max_steps": "4797/6362"} +{"lm loss": 4.90475273, "grad_norm": 0.34801683, "learning_rate": 1.816e-05, "elapsed_time_per_iteration": 6.35351205, "memory(GiB)": 21.51, "elapsed_time": "8h 45m 38s", "remaining_time": "2h 51m 20s", "loss_scale": 1.0, "consumed_samples": 1228288, "global_step/max_steps": "4798/6362"} +{"lm loss": 4.89945173, "grad_norm": 0.44003853, "learning_rate": 1.815e-05, "elapsed_time_per_iteration": 6.57082462, "memory(GiB)": 21.51, "elapsed_time": "8h 45m 44s", "remaining_time": "2h 51m 13s", "loss_scale": 1.0, "consumed_samples": 1228544, "global_step/max_steps": "4799/6362"} +{"lm loss": 4.90244102, "grad_norm": 0.3708528, "learning_rate": 1.813e-05, "elapsed_time_per_iteration": 6.67494488, "memory(GiB)": 21.51, "elapsed_time": "8h 45m 51s", "remaining_time": "2h 51m 7s", "loss_scale": 1.0, "consumed_samples": 1228800, "global_step/max_steps": "4800/6362"} +{"lm loss": 4.88938522, "grad_norm": 0.38659, "learning_rate": 1.811e-05, "elapsed_time_per_iteration": 6.60176253, "memory(GiB)": 21.51, "elapsed_time": "8h 45m 58s", "remaining_time": "2h 51m 0s", "loss_scale": 1.0, "consumed_samples": 1229056, "global_step/max_steps": "4801/6362"} +{"lm loss": 4.89921761, "grad_norm": 0.34121886, "learning_rate": 1.809e-05, "elapsed_time_per_iteration": 6.57377458, "memory(GiB)": 21.51, "elapsed_time": "8h 46m 4s", "remaining_time": "2h 50m 54s", "loss_scale": 1.0, "consumed_samples": 1229312, "global_step/max_steps": "4802/6362"} +{"lm loss": 4.9074626, "grad_norm": 0.36466825, "learning_rate": 1.807e-05, "elapsed_time_per_iteration": 6.60152578, "memory(GiB)": 21.51, "elapsed_time": "8h 46m 11s", "remaining_time": "2h 50m 47s", "loss_scale": 1.0, "consumed_samples": 1229568, "global_step/max_steps": "4803/6362"} +{"lm loss": 4.88073158, "grad_norm": 0.35812983, "learning_rate": 1.805e-05, "elapsed_time_per_iteration": 6.57727122, "memory(GiB)": 21.51, "elapsed_time": "8h 46m 17s", "remaining_time": "2h 50m 41s", "loss_scale": 1.0, "consumed_samples": 1229824, "global_step/max_steps": "4804/6362"} +{"lm loss": 4.86060572, "grad_norm": 0.34467483, "learning_rate": 1.804e-05, "elapsed_time_per_iteration": 6.72661519, "memory(GiB)": 21.51, "elapsed_time": "8h 46m 24s", "remaining_time": "2h 50m 34s", "loss_scale": 1.0, "consumed_samples": 1230080, "global_step/max_steps": "4805/6362"} +{"lm loss": 4.86415768, "grad_norm": 0.3564916, "learning_rate": 1.802e-05, "elapsed_time_per_iteration": 6.76092696, "memory(GiB)": 21.51, "elapsed_time": "8h 46m 31s", "remaining_time": "2h 50m 28s", "loss_scale": 1.0, "consumed_samples": 1230336, "global_step/max_steps": "4806/6362"} +{"lm loss": 4.88328314, "grad_norm": 0.35973427, "learning_rate": 1.8e-05, "elapsed_time_per_iteration": 6.7937336, "memory(GiB)": 21.51, "elapsed_time": "8h 46m 38s", "remaining_time": "2h 50m 21s", "loss_scale": 1.0, "consumed_samples": 1230592, "global_step/max_steps": "4807/6362"} +{"lm loss": 4.90443563, "grad_norm": 0.32744184, "learning_rate": 1.798e-05, "elapsed_time_per_iteration": 6.65581131, "memory(GiB)": 21.51, "elapsed_time": "8h 46m 44s", "remaining_time": "2h 50m 15s", "loss_scale": 1.0, "consumed_samples": 1230848, "global_step/max_steps": "4808/6362"} +{"lm loss": 4.86713982, "grad_norm": 0.37687191, "learning_rate": 1.796e-05, "elapsed_time_per_iteration": 6.62058067, "memory(GiB)": 21.51, "elapsed_time": "8h 46m 51s", "remaining_time": "2h 50m 8s", "loss_scale": 1.0, "consumed_samples": 1231104, "global_step/max_steps": "4809/6362"} +{"lm loss": 4.86759996, "grad_norm": 0.37205887, "learning_rate": 1.794e-05, "elapsed_time_per_iteration": 6.50626278, "memory(GiB)": 21.51, "elapsed_time": "8h 46m 57s", "remaining_time": "2h 50m 1s", "loss_scale": 1.0, "consumed_samples": 1231360, "global_step/max_steps": "4810/6362"} +{"lm loss": 4.87661982, "grad_norm": 0.37262255, "learning_rate": 1.793e-05, "elapsed_time_per_iteration": 6.58083391, "memory(GiB)": 21.51, "elapsed_time": "8h 47m 4s", "remaining_time": "2h 49m 55s", "loss_scale": 1.0, "consumed_samples": 1231616, "global_step/max_steps": "4811/6362"} +{"lm loss": 4.86809874, "grad_norm": 0.36946616, "learning_rate": 1.791e-05, "elapsed_time_per_iteration": 6.57514238, "memory(GiB)": 21.51, "elapsed_time": "8h 47m 11s", "remaining_time": "2h 49m 48s", "loss_scale": 1.0, "consumed_samples": 1231872, "global_step/max_steps": "4812/6362"} +{"lm loss": 4.89954567, "grad_norm": 0.41028953, "learning_rate": 1.789e-05, "elapsed_time_per_iteration": 6.46631455, "memory(GiB)": 21.51, "elapsed_time": "8h 47m 17s", "remaining_time": "2h 49m 42s", "loss_scale": 1.0, "consumed_samples": 1232128, "global_step/max_steps": "4813/6362"} +{"lm loss": 4.88992453, "grad_norm": 0.3815718, "learning_rate": 1.787e-05, "elapsed_time_per_iteration": 6.46063852, "memory(GiB)": 21.51, "elapsed_time": "8h 47m 24s", "remaining_time": "2h 49m 35s", "loss_scale": 1.0, "consumed_samples": 1232384, "global_step/max_steps": "4814/6362"} +{"lm loss": 4.88006496, "grad_norm": 0.39188585, "learning_rate": 1.785e-05, "elapsed_time_per_iteration": 6.66556501, "memory(GiB)": 21.51, "elapsed_time": "8h 47m 30s", "remaining_time": "2h 49m 28s", "loss_scale": 1.0, "consumed_samples": 1232640, "global_step/max_steps": "4815/6362"} +{"lm loss": 4.88760805, "grad_norm": 0.36273053, "learning_rate": 1.784e-05, "elapsed_time_per_iteration": 6.64319181, "memory(GiB)": 21.51, "elapsed_time": "8h 47m 37s", "remaining_time": "2h 49m 22s", "loss_scale": 1.0, "consumed_samples": 1232896, "global_step/max_steps": "4816/6362"} +{"lm loss": 4.88438559, "grad_norm": 0.36423853, "learning_rate": 1.782e-05, "elapsed_time_per_iteration": 6.56752443, "memory(GiB)": 21.51, "elapsed_time": "8h 47m 43s", "remaining_time": "2h 49m 15s", "loss_scale": 1.0, "consumed_samples": 1233152, "global_step/max_steps": "4817/6362"} +{"lm loss": 4.89865685, "grad_norm": 0.36265314, "learning_rate": 1.78e-05, "elapsed_time_per_iteration": 6.46229601, "memory(GiB)": 21.51, "elapsed_time": "8h 47m 50s", "remaining_time": "2h 49m 9s", "loss_scale": 1.0, "consumed_samples": 1233408, "global_step/max_steps": "4818/6362"} +{"lm loss": 4.89197588, "grad_norm": 0.39637333, "learning_rate": 1.778e-05, "elapsed_time_per_iteration": 6.47082567, "memory(GiB)": 21.51, "elapsed_time": "8h 47m 56s", "remaining_time": "2h 49m 2s", "loss_scale": 1.0, "consumed_samples": 1233664, "global_step/max_steps": "4819/6362"} +{"lm loss": 4.88102102, "grad_norm": 0.34665763, "learning_rate": 1.776e-05, "elapsed_time_per_iteration": 6.74850297, "memory(GiB)": 21.51, "elapsed_time": "8h 48m 3s", "remaining_time": "2h 48m 56s", "loss_scale": 1.0, "consumed_samples": 1233920, "global_step/max_steps": "4820/6362"} +{"lm loss": 4.8672123, "grad_norm": 0.38056681, "learning_rate": 1.774e-05, "elapsed_time_per_iteration": 6.7537303, "memory(GiB)": 21.51, "elapsed_time": "8h 48m 10s", "remaining_time": "2h 48m 49s", "loss_scale": 1.0, "consumed_samples": 1234176, "global_step/max_steps": "4821/6362"} +{"lm loss": 4.8614254, "grad_norm": 0.33900958, "learning_rate": 1.773e-05, "elapsed_time_per_iteration": 6.4658494, "memory(GiB)": 21.51, "elapsed_time": "8h 48m 16s", "remaining_time": "2h 48m 42s", "loss_scale": 1.0, "consumed_samples": 1234432, "global_step/max_steps": "4822/6362"} +{"lm loss": 4.88259268, "grad_norm": 0.35889402, "learning_rate": 1.771e-05, "elapsed_time_per_iteration": 6.42106509, "memory(GiB)": 21.51, "elapsed_time": "8h 48m 23s", "remaining_time": "2h 48m 36s", "loss_scale": 1.0, "consumed_samples": 1234688, "global_step/max_steps": "4823/6362"} +{"lm loss": 4.89150429, "grad_norm": 0.32295677, "learning_rate": 1.769e-05, "elapsed_time_per_iteration": 6.72056389, "memory(GiB)": 21.51, "elapsed_time": "8h 48m 29s", "remaining_time": "2h 48m 29s", "loss_scale": 1.0, "consumed_samples": 1234944, "global_step/max_steps": "4824/6362"} +{"lm loss": 4.89121151, "grad_norm": 0.35290074, "learning_rate": 1.767e-05, "elapsed_time_per_iteration": 6.58493042, "memory(GiB)": 21.51, "elapsed_time": "8h 48m 36s", "remaining_time": "2h 48m 23s", "loss_scale": 1.0, "consumed_samples": 1235200, "global_step/max_steps": "4825/6362"} +{"lm loss": 4.86867809, "grad_norm": 0.32867736, "learning_rate": 1.765e-05, "elapsed_time_per_iteration": 6.49798203, "memory(GiB)": 21.51, "elapsed_time": "8h 48m 43s", "remaining_time": "2h 48m 16s", "loss_scale": 1.0, "consumed_samples": 1235456, "global_step/max_steps": "4826/6362"} +{"lm loss": 4.87945318, "grad_norm": 0.34562653, "learning_rate": 1.764e-05, "elapsed_time_per_iteration": 6.66885185, "memory(GiB)": 21.51, "elapsed_time": "8h 48m 49s", "remaining_time": "2h 48m 10s", "loss_scale": 1.0, "consumed_samples": 1235712, "global_step/max_steps": "4827/6362"} +{"lm loss": 4.88129139, "grad_norm": 0.34112734, "learning_rate": 1.762e-05, "elapsed_time_per_iteration": 6.70807409, "memory(GiB)": 21.51, "elapsed_time": "8h 48m 56s", "remaining_time": "2h 48m 3s", "loss_scale": 1.0, "consumed_samples": 1235968, "global_step/max_steps": "4828/6362"} +{"lm loss": 4.8821969, "grad_norm": 0.32600474, "learning_rate": 1.76e-05, "elapsed_time_per_iteration": 6.44662786, "memory(GiB)": 21.51, "elapsed_time": "8h 49m 2s", "remaining_time": "2h 47m 56s", "loss_scale": 1.0, "consumed_samples": 1236224, "global_step/max_steps": "4829/6362"} +{"lm loss": 4.90223789, "grad_norm": 0.34467468, "learning_rate": 1.758e-05, "elapsed_time_per_iteration": 6.56930757, "memory(GiB)": 21.51, "elapsed_time": "8h 49m 9s", "remaining_time": "2h 47m 50s", "loss_scale": 1.0, "consumed_samples": 1236480, "global_step/max_steps": "4830/6362"} +{"lm loss": 4.8916316, "grad_norm": 0.31886557, "learning_rate": 1.756e-05, "elapsed_time_per_iteration": 6.74645686, "memory(GiB)": 21.51, "elapsed_time": "8h 49m 16s", "remaining_time": "2h 47m 43s", "loss_scale": 1.0, "consumed_samples": 1236736, "global_step/max_steps": "4831/6362"} +{"lm loss": 4.86582851, "grad_norm": 0.34894887, "learning_rate": 1.755e-05, "elapsed_time_per_iteration": 6.56713057, "memory(GiB)": 21.51, "elapsed_time": "8h 49m 22s", "remaining_time": "2h 47m 37s", "loss_scale": 1.0, "consumed_samples": 1236992, "global_step/max_steps": "4832/6362"} +{"lm loss": 4.88894701, "grad_norm": 0.32474077, "learning_rate": 1.753e-05, "elapsed_time_per_iteration": 6.58597112, "memory(GiB)": 21.51, "elapsed_time": "8h 49m 29s", "remaining_time": "2h 47m 30s", "loss_scale": 1.0, "consumed_samples": 1237248, "global_step/max_steps": "4833/6362"} +{"lm loss": 4.88564014, "grad_norm": 0.34647328, "learning_rate": 1.751e-05, "elapsed_time_per_iteration": 6.58922267, "memory(GiB)": 21.51, "elapsed_time": "8h 49m 35s", "remaining_time": "2h 47m 24s", "loss_scale": 1.0, "consumed_samples": 1237504, "global_step/max_steps": "4834/6362"} +{"lm loss": 4.87151146, "grad_norm": 0.32418561, "learning_rate": 1.749e-05, "elapsed_time_per_iteration": 6.63371301, "memory(GiB)": 21.51, "elapsed_time": "8h 49m 42s", "remaining_time": "2h 47m 17s", "loss_scale": 1.0, "consumed_samples": 1237760, "global_step/max_steps": "4835/6362"} +{"lm loss": 4.8948555, "grad_norm": 0.34090045, "learning_rate": 1.747e-05, "elapsed_time_per_iteration": 6.53049326, "memory(GiB)": 21.51, "elapsed_time": "8h 49m 49s", "remaining_time": "2h 47m 11s", "loss_scale": 1.0, "consumed_samples": 1238016, "global_step/max_steps": "4836/6362"} +{"lm loss": 4.88371134, "grad_norm": 0.34360701, "learning_rate": 1.746e-05, "elapsed_time_per_iteration": 6.49457836, "memory(GiB)": 21.51, "elapsed_time": "8h 49m 55s", "remaining_time": "2h 47m 4s", "loss_scale": 1.0, "consumed_samples": 1238272, "global_step/max_steps": "4837/6362"} +{"lm loss": 4.87248135, "grad_norm": 0.31174403, "learning_rate": 1.744e-05, "elapsed_time_per_iteration": 6.48608518, "memory(GiB)": 21.51, "elapsed_time": "8h 50m 2s", "remaining_time": "2h 46m 57s", "loss_scale": 1.0, "consumed_samples": 1238528, "global_step/max_steps": "4838/6362"} +{"lm loss": 4.90014839, "grad_norm": 0.3707602, "learning_rate": 1.742e-05, "elapsed_time_per_iteration": 6.68238544, "memory(GiB)": 21.51, "elapsed_time": "8h 50m 8s", "remaining_time": "2h 46m 51s", "loss_scale": 1.0, "consumed_samples": 1238784, "global_step/max_steps": "4839/6362"} +{"lm loss": 4.89406776, "grad_norm": 0.34285623, "learning_rate": 1.74e-05, "elapsed_time_per_iteration": 6.63519788, "memory(GiB)": 21.51, "elapsed_time": "8h 50m 15s", "remaining_time": "2h 46m 44s", "loss_scale": 1.0, "consumed_samples": 1239040, "global_step/max_steps": "4840/6362"} +{"lm loss": 4.8872571, "grad_norm": 0.33541864, "learning_rate": 1.738e-05, "elapsed_time_per_iteration": 6.58763385, "memory(GiB)": 21.51, "elapsed_time": "8h 50m 21s", "remaining_time": "2h 46m 38s", "loss_scale": 1.0, "consumed_samples": 1239296, "global_step/max_steps": "4841/6362"} +{"lm loss": 4.87087727, "grad_norm": 0.36466709, "learning_rate": 1.737e-05, "elapsed_time_per_iteration": 6.47196507, "memory(GiB)": 21.51, "elapsed_time": "8h 50m 28s", "remaining_time": "2h 46m 31s", "loss_scale": 1.0, "consumed_samples": 1239552, "global_step/max_steps": "4842/6362"} +{"lm loss": 4.86591625, "grad_norm": 0.349316, "learning_rate": 1.735e-05, "elapsed_time_per_iteration": 6.65829086, "memory(GiB)": 21.51, "elapsed_time": "8h 50m 35s", "remaining_time": "2h 46m 25s", "loss_scale": 1.0, "consumed_samples": 1239808, "global_step/max_steps": "4843/6362"} +{"lm loss": 4.87849426, "grad_norm": 0.40522999, "learning_rate": 1.733e-05, "elapsed_time_per_iteration": 6.5707798, "memory(GiB)": 21.51, "elapsed_time": "8h 50m 41s", "remaining_time": "2h 46m 18s", "loss_scale": 1.0, "consumed_samples": 1240064, "global_step/max_steps": "4844/6362"} +{"lm loss": 4.88735151, "grad_norm": 0.35485721, "learning_rate": 1.731e-05, "elapsed_time_per_iteration": 6.52901077, "memory(GiB)": 21.51, "elapsed_time": "8h 50m 48s", "remaining_time": "2h 46m 11s", "loss_scale": 1.0, "consumed_samples": 1240320, "global_step/max_steps": "4845/6362"} +{"lm loss": 4.87098551, "grad_norm": 0.40037176, "learning_rate": 1.73e-05, "elapsed_time_per_iteration": 6.6316483, "memory(GiB)": 21.51, "elapsed_time": "8h 50m 54s", "remaining_time": "2h 46m 5s", "loss_scale": 1.0, "consumed_samples": 1240576, "global_step/max_steps": "4846/6362"} +{"lm loss": 4.87798071, "grad_norm": 0.35637471, "learning_rate": 1.728e-05, "elapsed_time_per_iteration": 6.64916539, "memory(GiB)": 21.51, "elapsed_time": "8h 51m 1s", "remaining_time": "2h 45m 58s", "loss_scale": 1.0, "consumed_samples": 1240832, "global_step/max_steps": "4847/6362"} +{"lm loss": 4.88737392, "grad_norm": 0.36624143, "learning_rate": 1.726e-05, "elapsed_time_per_iteration": 6.58680153, "memory(GiB)": 21.51, "elapsed_time": "8h 51m 8s", "remaining_time": "2h 45m 52s", "loss_scale": 1.0, "consumed_samples": 1241088, "global_step/max_steps": "4848/6362"} +{"lm loss": 4.87156296, "grad_norm": 0.35903832, "learning_rate": 1.724e-05, "elapsed_time_per_iteration": 6.57652092, "memory(GiB)": 21.51, "elapsed_time": "8h 51m 14s", "remaining_time": "2h 45m 45s", "loss_scale": 1.0, "consumed_samples": 1241344, "global_step/max_steps": "4849/6362"} +{"lm loss": 4.86679125, "grad_norm": 0.35436904, "learning_rate": 1.722e-05, "elapsed_time_per_iteration": 6.74261236, "memory(GiB)": 21.51, "elapsed_time": "8h 51m 21s", "remaining_time": "2h 45m 39s", "loss_scale": 1.0, "consumed_samples": 1241600, "global_step/max_steps": "4850/6362"} +{"lm loss": 4.88374805, "grad_norm": 0.39927176, "learning_rate": 1.721e-05, "elapsed_time_per_iteration": 6.63254809, "memory(GiB)": 21.51, "elapsed_time": "8h 51m 28s", "remaining_time": "2h 45m 32s", "loss_scale": 1.0, "consumed_samples": 1241856, "global_step/max_steps": "4851/6362"} +{"lm loss": 4.89369869, "grad_norm": 0.35050344, "learning_rate": 1.719e-05, "elapsed_time_per_iteration": 6.72691894, "memory(GiB)": 21.51, "elapsed_time": "8h 51m 34s", "remaining_time": "2h 45m 26s", "loss_scale": 1.0, "consumed_samples": 1242112, "global_step/max_steps": "4852/6362"} +{"lm loss": 4.85922623, "grad_norm": 0.35965988, "learning_rate": 1.717e-05, "elapsed_time_per_iteration": 6.54515696, "memory(GiB)": 21.51, "elapsed_time": "8h 51m 41s", "remaining_time": "2h 45m 19s", "loss_scale": 1.0, "consumed_samples": 1242368, "global_step/max_steps": "4853/6362"} +{"lm loss": 4.86788464, "grad_norm": 0.34047845, "learning_rate": 1.715e-05, "elapsed_time_per_iteration": 6.90827012, "memory(GiB)": 21.51, "elapsed_time": "8h 51m 48s", "remaining_time": "2h 45m 12s", "loss_scale": 1.0, "consumed_samples": 1242624, "global_step/max_steps": "4854/6362"} +{"lm loss": 4.90792704, "grad_norm": 0.39513031, "learning_rate": 1.713e-05, "elapsed_time_per_iteration": 6.60271645, "memory(GiB)": 21.51, "elapsed_time": "8h 51m 54s", "remaining_time": "2h 45m 6s", "loss_scale": 1.0, "consumed_samples": 1242880, "global_step/max_steps": "4855/6362"} +{"lm loss": 4.87492037, "grad_norm": 0.35154349, "learning_rate": 1.712e-05, "elapsed_time_per_iteration": 6.58045912, "memory(GiB)": 21.51, "elapsed_time": "8h 52m 1s", "remaining_time": "2h 44m 59s", "loss_scale": 1.0, "consumed_samples": 1243136, "global_step/max_steps": "4856/6362"} +{"lm loss": 4.89639425, "grad_norm": 0.38822672, "learning_rate": 1.71e-05, "elapsed_time_per_iteration": 6.69186783, "memory(GiB)": 21.51, "elapsed_time": "8h 52m 8s", "remaining_time": "2h 44m 53s", "loss_scale": 1.0, "consumed_samples": 1243392, "global_step/max_steps": "4857/6362"} +{"lm loss": 4.8694129, "grad_norm": 0.36776221, "learning_rate": 1.708e-05, "elapsed_time_per_iteration": 6.50541139, "memory(GiB)": 21.51, "elapsed_time": "8h 52m 14s", "remaining_time": "2h 44m 46s", "loss_scale": 1.0, "consumed_samples": 1243648, "global_step/max_steps": "4858/6362"} +{"lm loss": 4.9158926, "grad_norm": 0.3431254, "learning_rate": 1.706e-05, "elapsed_time_per_iteration": 6.80826116, "memory(GiB)": 21.51, "elapsed_time": "8h 52m 21s", "remaining_time": "2h 44m 40s", "loss_scale": 1.0, "consumed_samples": 1243904, "global_step/max_steps": "4859/6362"} +{"lm loss": 4.88337612, "grad_norm": 0.38037157, "learning_rate": 1.705e-05, "elapsed_time_per_iteration": 6.46251154, "memory(GiB)": 21.51, "elapsed_time": "8h 52m 27s", "remaining_time": "2h 44m 33s", "loss_scale": 1.0, "consumed_samples": 1244160, "global_step/max_steps": "4860/6362"} +{"lm loss": 4.90643311, "grad_norm": 0.32839799, "learning_rate": 1.703e-05, "elapsed_time_per_iteration": 6.47337985, "memory(GiB)": 21.51, "elapsed_time": "8h 52m 34s", "remaining_time": "2h 44m 26s", "loss_scale": 1.0, "consumed_samples": 1244416, "global_step/max_steps": "4861/6362"} +{"lm loss": 4.88343239, "grad_norm": 0.37646931, "learning_rate": 1.701e-05, "elapsed_time_per_iteration": 6.24470329, "memory(GiB)": 21.51, "elapsed_time": "8h 52m 40s", "remaining_time": "2h 44m 20s", "loss_scale": 1.0, "consumed_samples": 1244672, "global_step/max_steps": "4862/6362"} +{"lm loss": 4.89227057, "grad_norm": 0.36043268, "learning_rate": 1.699e-05, "elapsed_time_per_iteration": 6.59859562, "memory(GiB)": 21.51, "elapsed_time": "8h 52m 47s", "remaining_time": "2h 44m 13s", "loss_scale": 1.0, "consumed_samples": 1244928, "global_step/max_steps": "4863/6362"} +{"lm loss": 4.87582541, "grad_norm": 0.35473976, "learning_rate": 1.697e-05, "elapsed_time_per_iteration": 6.61414766, "memory(GiB)": 21.51, "elapsed_time": "8h 52m 53s", "remaining_time": "2h 44m 7s", "loss_scale": 1.0, "consumed_samples": 1245184, "global_step/max_steps": "4864/6362"} +{"lm loss": 4.88054514, "grad_norm": 0.36534774, "learning_rate": 1.696e-05, "elapsed_time_per_iteration": 6.44670177, "memory(GiB)": 21.51, "elapsed_time": "8h 53m 0s", "remaining_time": "2h 44m 0s", "loss_scale": 1.0, "consumed_samples": 1245440, "global_step/max_steps": "4865/6362"} +{"lm loss": 4.878407, "grad_norm": 0.34175101, "learning_rate": 1.694e-05, "elapsed_time_per_iteration": 6.7041564, "memory(GiB)": 21.51, "elapsed_time": "8h 53m 6s", "remaining_time": "2h 43m 54s", "loss_scale": 1.0, "consumed_samples": 1245696, "global_step/max_steps": "4866/6362"} +{"lm loss": 4.87468624, "grad_norm": 0.34700745, "learning_rate": 1.692e-05, "elapsed_time_per_iteration": 6.48842335, "memory(GiB)": 21.51, "elapsed_time": "8h 53m 13s", "remaining_time": "2h 43m 47s", "loss_scale": 1.0, "consumed_samples": 1245952, "global_step/max_steps": "4867/6362"} +{"lm loss": 4.86834383, "grad_norm": 0.37883371, "learning_rate": 1.69e-05, "elapsed_time_per_iteration": 6.40333486, "memory(GiB)": 21.51, "elapsed_time": "8h 53m 19s", "remaining_time": "2h 43m 40s", "loss_scale": 1.0, "consumed_samples": 1246208, "global_step/max_steps": "4868/6362"} +{"lm loss": 4.87556934, "grad_norm": 0.32949641, "learning_rate": 1.689e-05, "elapsed_time_per_iteration": 6.45701313, "memory(GiB)": 21.51, "elapsed_time": "8h 53m 26s", "remaining_time": "2h 43m 34s", "loss_scale": 1.0, "consumed_samples": 1246464, "global_step/max_steps": "4869/6362"} +{"lm loss": 4.88065481, "grad_norm": 0.36693001, "learning_rate": 1.687e-05, "elapsed_time_per_iteration": 6.50109267, "memory(GiB)": 21.51, "elapsed_time": "8h 53m 32s", "remaining_time": "2h 43m 27s", "loss_scale": 1.0, "consumed_samples": 1246720, "global_step/max_steps": "4870/6362"} +{"lm loss": 4.88984919, "grad_norm": 0.31755087, "learning_rate": 1.685e-05, "elapsed_time_per_iteration": 6.41700077, "memory(GiB)": 21.51, "elapsed_time": "8h 53m 39s", "remaining_time": "2h 43m 20s", "loss_scale": 1.0, "consumed_samples": 1246976, "global_step/max_steps": "4871/6362"} +{"lm loss": 4.86963224, "grad_norm": 0.40565473, "learning_rate": 1.683e-05, "elapsed_time_per_iteration": 6.44330168, "memory(GiB)": 21.51, "elapsed_time": "8h 53m 45s", "remaining_time": "2h 43m 14s", "loss_scale": 1.0, "consumed_samples": 1247232, "global_step/max_steps": "4872/6362"} +{"lm loss": 4.87639523, "grad_norm": 0.32384124, "learning_rate": 1.682e-05, "elapsed_time_per_iteration": 6.56118059, "memory(GiB)": 21.51, "elapsed_time": "8h 53m 52s", "remaining_time": "2h 43m 7s", "loss_scale": 1.0, "consumed_samples": 1247488, "global_step/max_steps": "4873/6362"} +{"lm loss": 4.87099075, "grad_norm": 0.37741116, "learning_rate": 1.68e-05, "elapsed_time_per_iteration": 6.5102036, "memory(GiB)": 21.51, "elapsed_time": "8h 53m 58s", "remaining_time": "2h 43m 1s", "loss_scale": 1.0, "consumed_samples": 1247744, "global_step/max_steps": "4874/6362"} +{"lm loss": 4.88898993, "grad_norm": 0.35468146, "learning_rate": 1.678e-05, "elapsed_time_per_iteration": 6.35328221, "memory(GiB)": 21.51, "elapsed_time": "8h 54m 5s", "remaining_time": "2h 42m 54s", "loss_scale": 1.0, "consumed_samples": 1248000, "global_step/max_steps": "4875/6362"} +{"lm loss": 4.88062716, "grad_norm": 0.3699792, "learning_rate": 1.676e-05, "elapsed_time_per_iteration": 6.47877192, "memory(GiB)": 21.51, "elapsed_time": "8h 54m 11s", "remaining_time": "2h 42m 47s", "loss_scale": 1.0, "consumed_samples": 1248256, "global_step/max_steps": "4876/6362"} +{"lm loss": 4.87422562, "grad_norm": 0.33481586, "learning_rate": 1.675e-05, "elapsed_time_per_iteration": 6.50671482, "memory(GiB)": 21.51, "elapsed_time": "8h 54m 18s", "remaining_time": "2h 42m 41s", "loss_scale": 1.0, "consumed_samples": 1248512, "global_step/max_steps": "4877/6362"} +{"lm loss": 4.88651228, "grad_norm": 0.33909222, "learning_rate": 1.673e-05, "elapsed_time_per_iteration": 6.67344451, "memory(GiB)": 21.51, "elapsed_time": "8h 54m 24s", "remaining_time": "2h 42m 34s", "loss_scale": 1.0, "consumed_samples": 1248768, "global_step/max_steps": "4878/6362"} +{"lm loss": 4.86708736, "grad_norm": 0.32149163, "learning_rate": 1.671e-05, "elapsed_time_per_iteration": 6.56389451, "memory(GiB)": 21.51, "elapsed_time": "8h 54m 31s", "remaining_time": "2h 42m 28s", "loss_scale": 1.0, "consumed_samples": 1249024, "global_step/max_steps": "4879/6362"} +{"lm loss": 4.86740541, "grad_norm": 0.33156404, "learning_rate": 1.669e-05, "elapsed_time_per_iteration": 6.51025295, "memory(GiB)": 21.51, "elapsed_time": "8h 54m 37s", "remaining_time": "2h 42m 21s", "loss_scale": 1.0, "consumed_samples": 1249280, "global_step/max_steps": "4880/6362"} +{"lm loss": 4.8929925, "grad_norm": 0.34795982, "learning_rate": 1.668e-05, "elapsed_time_per_iteration": 6.50474048, "memory(GiB)": 21.51, "elapsed_time": "8h 54m 44s", "remaining_time": "2h 42m 15s", "loss_scale": 1.0, "consumed_samples": 1249536, "global_step/max_steps": "4881/6362"} +{"lm loss": 4.87118769, "grad_norm": 0.32324505, "learning_rate": 1.666e-05, "elapsed_time_per_iteration": 6.42500019, "memory(GiB)": 21.51, "elapsed_time": "8h 54m 50s", "remaining_time": "2h 42m 8s", "loss_scale": 1.0, "consumed_samples": 1249792, "global_step/max_steps": "4882/6362"} +{"lm loss": 4.86564207, "grad_norm": 0.3342348, "learning_rate": 1.664e-05, "elapsed_time_per_iteration": 6.41565442, "memory(GiB)": 21.51, "elapsed_time": "8h 54m 57s", "remaining_time": "2h 42m 1s", "loss_scale": 1.0, "consumed_samples": 1250048, "global_step/max_steps": "4883/6362"} +{"lm loss": 4.87550116, "grad_norm": 0.32232693, "learning_rate": 1.662e-05, "elapsed_time_per_iteration": 6.55413651, "memory(GiB)": 21.51, "elapsed_time": "8h 55m 3s", "remaining_time": "2h 41m 55s", "loss_scale": 1.0, "consumed_samples": 1250304, "global_step/max_steps": "4884/6362"} +{"lm loss": 4.85389519, "grad_norm": 0.37305388, "learning_rate": 1.661e-05, "elapsed_time_per_iteration": 6.57346821, "memory(GiB)": 21.51, "elapsed_time": "8h 55m 10s", "remaining_time": "2h 41m 48s", "loss_scale": 1.0, "consumed_samples": 1250560, "global_step/max_steps": "4885/6362"} +{"lm loss": 4.88503599, "grad_norm": 0.32602498, "learning_rate": 1.659e-05, "elapsed_time_per_iteration": 6.30245256, "memory(GiB)": 21.51, "elapsed_time": "8h 55m 16s", "remaining_time": "2h 41m 42s", "loss_scale": 1.0, "consumed_samples": 1250816, "global_step/max_steps": "4886/6362"} +{"lm loss": 4.87863398, "grad_norm": 0.34490952, "learning_rate": 1.657e-05, "elapsed_time_per_iteration": 6.41814733, "memory(GiB)": 21.51, "elapsed_time": "8h 55m 22s", "remaining_time": "2h 41m 35s", "loss_scale": 1.0, "consumed_samples": 1251072, "global_step/max_steps": "4887/6362"} +{"lm loss": 4.89315844, "grad_norm": 0.35838664, "learning_rate": 1.655e-05, "elapsed_time_per_iteration": 6.6679256, "memory(GiB)": 21.51, "elapsed_time": "8h 55m 29s", "remaining_time": "2h 41m 28s", "loss_scale": 1.0, "consumed_samples": 1251328, "global_step/max_steps": "4888/6362"} +{"lm loss": 4.87078714, "grad_norm": 0.32108894, "learning_rate": 1.654e-05, "elapsed_time_per_iteration": 6.54838181, "memory(GiB)": 21.51, "elapsed_time": "8h 55m 36s", "remaining_time": "2h 41m 22s", "loss_scale": 1.0, "consumed_samples": 1251584, "global_step/max_steps": "4889/6362"} +{"lm loss": 4.86036491, "grad_norm": 0.35767058, "learning_rate": 1.652e-05, "elapsed_time_per_iteration": 6.3451221, "memory(GiB)": 21.51, "elapsed_time": "8h 55m 42s", "remaining_time": "2h 41m 15s", "loss_scale": 1.0, "consumed_samples": 1251840, "global_step/max_steps": "4890/6362"} +{"lm loss": 4.86432171, "grad_norm": 0.31380132, "learning_rate": 1.65e-05, "elapsed_time_per_iteration": 6.24556565, "memory(GiB)": 21.51, "elapsed_time": "8h 55m 48s", "remaining_time": "2h 41m 8s", "loss_scale": 1.0, "consumed_samples": 1252096, "global_step/max_steps": "4891/6362"} +{"lm loss": 4.89416027, "grad_norm": 0.34516528, "learning_rate": 1.648e-05, "elapsed_time_per_iteration": 6.62818503, "memory(GiB)": 21.51, "elapsed_time": "8h 55m 55s", "remaining_time": "2h 41m 2s", "loss_scale": 1.0, "consumed_samples": 1252352, "global_step/max_steps": "4892/6362"} +{"lm loss": 4.88167524, "grad_norm": 0.3293297, "learning_rate": 1.647e-05, "elapsed_time_per_iteration": 6.61845279, "memory(GiB)": 21.51, "elapsed_time": "8h 56m 2s", "remaining_time": "2h 40m 55s", "loss_scale": 1.0, "consumed_samples": 1252608, "global_step/max_steps": "4893/6362"} +{"lm loss": 4.87894869, "grad_norm": 0.3167105, "learning_rate": 1.645e-05, "elapsed_time_per_iteration": 6.55764151, "memory(GiB)": 21.51, "elapsed_time": "8h 56m 8s", "remaining_time": "2h 40m 49s", "loss_scale": 1.0, "consumed_samples": 1252864, "global_step/max_steps": "4894/6362"} +{"lm loss": 4.85863829, "grad_norm": 0.35420558, "learning_rate": 1.643e-05, "elapsed_time_per_iteration": 6.49050331, "memory(GiB)": 21.51, "elapsed_time": "8h 56m 15s", "remaining_time": "2h 40m 42s", "loss_scale": 1.0, "consumed_samples": 1253120, "global_step/max_steps": "4895/6362"} +{"lm loss": 4.87391043, "grad_norm": 0.33455449, "learning_rate": 1.641e-05, "elapsed_time_per_iteration": 6.60603094, "memory(GiB)": 21.51, "elapsed_time": "8h 56m 21s", "remaining_time": "2h 40m 36s", "loss_scale": 1.0, "consumed_samples": 1253376, "global_step/max_steps": "4896/6362"} +{"lm loss": 4.8999567, "grad_norm": 0.35806379, "learning_rate": 1.64e-05, "elapsed_time_per_iteration": 6.635396, "memory(GiB)": 21.51, "elapsed_time": "8h 56m 28s", "remaining_time": "2h 40m 29s", "loss_scale": 1.0, "consumed_samples": 1253632, "global_step/max_steps": "4897/6362"} +{"lm loss": 4.85059023, "grad_norm": 0.34677774, "learning_rate": 1.638e-05, "elapsed_time_per_iteration": 6.60697436, "memory(GiB)": 21.51, "elapsed_time": "8h 56m 34s", "remaining_time": "2h 40m 22s", "loss_scale": 1.0, "consumed_samples": 1253888, "global_step/max_steps": "4898/6362"} +{"lm loss": 4.87737846, "grad_norm": 0.33120689, "learning_rate": 1.636e-05, "elapsed_time_per_iteration": 6.29421687, "memory(GiB)": 21.51, "elapsed_time": "8h 56m 41s", "remaining_time": "2h 40m 16s", "loss_scale": 1.0, "consumed_samples": 1254144, "global_step/max_steps": "4899/6362"} +{"lm loss": 4.885571, "grad_norm": 0.35436991, "learning_rate": 1.634e-05, "elapsed_time_per_iteration": 6.35891843, "memory(GiB)": 21.51, "elapsed_time": "8h 56m 47s", "remaining_time": "2h 40m 9s", "loss_scale": 1.0, "consumed_samples": 1254400, "global_step/max_steps": "4900/6362"} +{"lm loss": 4.86141539, "grad_norm": 0.32369003, "learning_rate": 1.633e-05, "elapsed_time_per_iteration": 6.69309926, "memory(GiB)": 21.51, "elapsed_time": "8h 56m 54s", "remaining_time": "2h 40m 3s", "loss_scale": 1.0, "consumed_samples": 1254656, "global_step/max_steps": "4901/6362"} +{"lm loss": 4.87275028, "grad_norm": 0.35846919, "learning_rate": 1.631e-05, "elapsed_time_per_iteration": 6.47289944, "memory(GiB)": 21.51, "elapsed_time": "8h 57m 0s", "remaining_time": "2h 39m 56s", "loss_scale": 1.0, "consumed_samples": 1254912, "global_step/max_steps": "4902/6362"} +{"lm loss": 4.8747201, "grad_norm": 0.31449351, "learning_rate": 1.629e-05, "elapsed_time_per_iteration": 6.63257647, "memory(GiB)": 21.51, "elapsed_time": "8h 57m 7s", "remaining_time": "2h 39m 49s", "loss_scale": 1.0, "consumed_samples": 1255168, "global_step/max_steps": "4903/6362"} +{"lm loss": 4.86139584, "grad_norm": 0.37212819, "learning_rate": 1.627e-05, "elapsed_time_per_iteration": 6.50397706, "memory(GiB)": 21.51, "elapsed_time": "8h 57m 13s", "remaining_time": "2h 39m 43s", "loss_scale": 1.0, "consumed_samples": 1255424, "global_step/max_steps": "4904/6362"} +{"lm loss": 4.86848402, "grad_norm": 0.32662633, "learning_rate": 1.626e-05, "elapsed_time_per_iteration": 6.42716932, "memory(GiB)": 21.51, "elapsed_time": "8h 57m 20s", "remaining_time": "2h 39m 36s", "loss_scale": 1.0, "consumed_samples": 1255680, "global_step/max_steps": "4905/6362"} +{"lm loss": 4.88171673, "grad_norm": 0.34107849, "learning_rate": 1.624e-05, "elapsed_time_per_iteration": 6.48351026, "memory(GiB)": 21.51, "elapsed_time": "8h 57m 26s", "remaining_time": "2h 39m 30s", "loss_scale": 1.0, "consumed_samples": 1255936, "global_step/max_steps": "4906/6362"} +{"lm loss": 4.86840248, "grad_norm": 0.31930113, "learning_rate": 1.622e-05, "elapsed_time_per_iteration": 6.78816581, "memory(GiB)": 21.51, "elapsed_time": "8h 57m 33s", "remaining_time": "2h 39m 23s", "loss_scale": 1.0, "consumed_samples": 1256192, "global_step/max_steps": "4907/6362"} +{"lm loss": 4.87539387, "grad_norm": 0.36512107, "learning_rate": 1.62e-05, "elapsed_time_per_iteration": 6.46841574, "memory(GiB)": 21.51, "elapsed_time": "8h 57m 40s", "remaining_time": "2h 39m 17s", "loss_scale": 1.0, "consumed_samples": 1256448, "global_step/max_steps": "4908/6362"} +{"lm loss": 4.88332653, "grad_norm": 0.31727442, "learning_rate": 1.619e-05, "elapsed_time_per_iteration": 6.59587717, "memory(GiB)": 21.51, "elapsed_time": "8h 57m 46s", "remaining_time": "2h 39m 10s", "loss_scale": 1.0, "consumed_samples": 1256704, "global_step/max_steps": "4909/6362"} +{"lm loss": 4.87776995, "grad_norm": 0.35903671, "learning_rate": 1.617e-05, "elapsed_time_per_iteration": 6.46037936, "memory(GiB)": 21.51, "elapsed_time": "8h 57m 53s", "remaining_time": "2h 39m 3s", "loss_scale": 1.0, "consumed_samples": 1256960, "global_step/max_steps": "4910/6362"} +{"lm loss": 4.88248491, "grad_norm": 0.32931191, "learning_rate": 1.615e-05, "elapsed_time_per_iteration": 6.62930536, "memory(GiB)": 21.51, "elapsed_time": "8h 57m 59s", "remaining_time": "2h 38m 57s", "loss_scale": 1.0, "consumed_samples": 1257216, "global_step/max_steps": "4911/6362"} +{"lm loss": 4.88804436, "grad_norm": 0.34524184, "learning_rate": 1.614e-05, "elapsed_time_per_iteration": 6.7644043, "memory(GiB)": 21.51, "elapsed_time": "8h 58m 6s", "remaining_time": "2h 38m 50s", "loss_scale": 1.0, "consumed_samples": 1257472, "global_step/max_steps": "4912/6362"} +{"lm loss": 4.87300539, "grad_norm": 0.33383596, "learning_rate": 1.612e-05, "elapsed_time_per_iteration": 6.46961474, "memory(GiB)": 21.51, "elapsed_time": "8h 58m 12s", "remaining_time": "2h 38m 44s", "loss_scale": 1.0, "consumed_samples": 1257728, "global_step/max_steps": "4913/6362"} +{"lm loss": 4.86786032, "grad_norm": 0.3361094, "learning_rate": 1.61e-05, "elapsed_time_per_iteration": 6.64656496, "memory(GiB)": 21.51, "elapsed_time": "8h 58m 19s", "remaining_time": "2h 38m 37s", "loss_scale": 1.0, "consumed_samples": 1257984, "global_step/max_steps": "4914/6362"} +{"lm loss": 4.88155651, "grad_norm": 0.3433564, "learning_rate": 1.608e-05, "elapsed_time_per_iteration": 6.53807306, "memory(GiB)": 21.51, "elapsed_time": "8h 58m 26s", "remaining_time": "2h 38m 31s", "loss_scale": 1.0, "consumed_samples": 1258240, "global_step/max_steps": "4915/6362"} +{"lm loss": 4.8856945, "grad_norm": 0.32210314, "learning_rate": 1.607e-05, "elapsed_time_per_iteration": 6.57192206, "memory(GiB)": 21.51, "elapsed_time": "8h 58m 32s", "remaining_time": "2h 38m 24s", "loss_scale": 1.0, "consumed_samples": 1258496, "global_step/max_steps": "4916/6362"} +{"lm loss": 4.86582565, "grad_norm": 0.35929167, "learning_rate": 1.605e-05, "elapsed_time_per_iteration": 6.50660801, "memory(GiB)": 21.51, "elapsed_time": "8h 58m 39s", "remaining_time": "2h 38m 17s", "loss_scale": 1.0, "consumed_samples": 1258752, "global_step/max_steps": "4917/6362"} +{"lm loss": 4.85884571, "grad_norm": 0.33601782, "learning_rate": 1.603e-05, "elapsed_time_per_iteration": 6.73228931, "memory(GiB)": 21.51, "elapsed_time": "8h 58m 45s", "remaining_time": "2h 38m 11s", "loss_scale": 1.0, "consumed_samples": 1259008, "global_step/max_steps": "4918/6362"} +{"lm loss": 4.86634302, "grad_norm": 0.33702585, "learning_rate": 1.602e-05, "elapsed_time_per_iteration": 6.47705936, "memory(GiB)": 21.51, "elapsed_time": "8h 58m 52s", "remaining_time": "2h 38m 4s", "loss_scale": 1.0, "consumed_samples": 1259264, "global_step/max_steps": "4919/6362"} +{"lm loss": 4.85051441, "grad_norm": 0.32877332, "learning_rate": 1.6e-05, "elapsed_time_per_iteration": 6.61745429, "memory(GiB)": 21.51, "elapsed_time": "8h 58m 59s", "remaining_time": "2h 37m 58s", "loss_scale": 1.0, "consumed_samples": 1259520, "global_step/max_steps": "4920/6362"} +{"lm loss": 4.86290646, "grad_norm": 0.34972107, "learning_rate": 1.598e-05, "elapsed_time_per_iteration": 6.51302195, "memory(GiB)": 21.51, "elapsed_time": "8h 59m 5s", "remaining_time": "2h 37m 51s", "loss_scale": 1.0, "consumed_samples": 1259776, "global_step/max_steps": "4921/6362"} +{"lm loss": 4.88554621, "grad_norm": 0.34186184, "learning_rate": 1.596e-05, "elapsed_time_per_iteration": 6.77011943, "memory(GiB)": 21.51, "elapsed_time": "8h 59m 12s", "remaining_time": "2h 37m 45s", "loss_scale": 1.0, "consumed_samples": 1260032, "global_step/max_steps": "4922/6362"} +{"lm loss": 4.88149452, "grad_norm": 0.35185096, "learning_rate": 1.595e-05, "elapsed_time_per_iteration": 6.68524432, "memory(GiB)": 21.51, "elapsed_time": "8h 59m 19s", "remaining_time": "2h 37m 38s", "loss_scale": 1.0, "consumed_samples": 1260288, "global_step/max_steps": "4923/6362"} +{"lm loss": 4.87236357, "grad_norm": 0.35233372, "learning_rate": 1.593e-05, "elapsed_time_per_iteration": 6.5819726, "memory(GiB)": 21.51, "elapsed_time": "8h 59m 25s", "remaining_time": "2h 37m 32s", "loss_scale": 1.0, "consumed_samples": 1260544, "global_step/max_steps": "4924/6362"} +{"lm loss": 4.86155367, "grad_norm": 0.36491311, "learning_rate": 1.591e-05, "elapsed_time_per_iteration": 6.57309842, "memory(GiB)": 21.51, "elapsed_time": "8h 59m 32s", "remaining_time": "2h 37m 25s", "loss_scale": 1.0, "consumed_samples": 1260800, "global_step/max_steps": "4925/6362"} +{"lm loss": 4.89777946, "grad_norm": 0.33829716, "learning_rate": 1.59e-05, "elapsed_time_per_iteration": 6.52069259, "memory(GiB)": 21.51, "elapsed_time": "8h 59m 38s", "remaining_time": "2h 37m 18s", "loss_scale": 1.0, "consumed_samples": 1261056, "global_step/max_steps": "4926/6362"} +{"lm loss": 4.87658691, "grad_norm": 0.33495709, "learning_rate": 1.588e-05, "elapsed_time_per_iteration": 6.49138427, "memory(GiB)": 21.51, "elapsed_time": "8h 59m 45s", "remaining_time": "2h 37m 12s", "loss_scale": 1.0, "consumed_samples": 1261312, "global_step/max_steps": "4927/6362"} +{"lm loss": 4.86408615, "grad_norm": 0.34551021, "learning_rate": 1.586e-05, "elapsed_time_per_iteration": 6.34731221, "memory(GiB)": 21.51, "elapsed_time": "8h 59m 51s", "remaining_time": "2h 37m 5s", "loss_scale": 1.0, "consumed_samples": 1261568, "global_step/max_steps": "4928/6362"} +{"lm loss": 4.88109255, "grad_norm": 0.36496019, "learning_rate": 1.584e-05, "elapsed_time_per_iteration": 6.57474518, "memory(GiB)": 21.51, "elapsed_time": "8h 59m 58s", "remaining_time": "2h 36m 59s", "loss_scale": 1.0, "consumed_samples": 1261824, "global_step/max_steps": "4929/6362"} +{"lm loss": 4.87632179, "grad_norm": 0.33835247, "learning_rate": 1.583e-05, "elapsed_time_per_iteration": 6.51512885, "memory(GiB)": 21.51, "elapsed_time": "9h 0m 4s", "remaining_time": "2h 36m 52s", "loss_scale": 1.0, "consumed_samples": 1262080, "global_step/max_steps": "4930/6362"} +{"lm loss": 4.86967516, "grad_norm": 0.36043158, "learning_rate": 1.581e-05, "elapsed_time_per_iteration": 6.7666533, "memory(GiB)": 21.51, "elapsed_time": "9h 0m 11s", "remaining_time": "2h 36m 45s", "loss_scale": 1.0, "consumed_samples": 1262336, "global_step/max_steps": "4931/6362"} +{"lm loss": 4.87495852, "grad_norm": 0.35020959, "learning_rate": 1.579e-05, "elapsed_time_per_iteration": 6.8783114, "memory(GiB)": 21.51, "elapsed_time": "9h 0m 18s", "remaining_time": "2h 36m 39s", "loss_scale": 1.0, "consumed_samples": 1262592, "global_step/max_steps": "4932/6362"} +{"lm loss": 4.90488386, "grad_norm": 0.37794748, "learning_rate": 1.578e-05, "elapsed_time_per_iteration": 6.74870586, "memory(GiB)": 21.51, "elapsed_time": "9h 0m 25s", "remaining_time": "2h 36m 32s", "loss_scale": 1.0, "consumed_samples": 1262848, "global_step/max_steps": "4933/6362"} +{"lm loss": 4.8825779, "grad_norm": 0.34982827, "learning_rate": 1.576e-05, "elapsed_time_per_iteration": 6.44355869, "memory(GiB)": 21.51, "elapsed_time": "9h 0m 31s", "remaining_time": "2h 36m 26s", "loss_scale": 1.0, "consumed_samples": 1263104, "global_step/max_steps": "4934/6362"} +{"lm loss": 4.883883, "grad_norm": 0.3119669, "learning_rate": 1.574e-05, "elapsed_time_per_iteration": 6.47411942, "memory(GiB)": 21.51, "elapsed_time": "9h 0m 37s", "remaining_time": "2h 36m 19s", "loss_scale": 1.0, "consumed_samples": 1263360, "global_step/max_steps": "4935/6362"} +{"lm loss": 4.85264254, "grad_norm": 0.34149542, "learning_rate": 1.572e-05, "elapsed_time_per_iteration": 6.63818979, "memory(GiB)": 21.51, "elapsed_time": "9h 0m 44s", "remaining_time": "2h 36m 13s", "loss_scale": 1.0, "consumed_samples": 1263616, "global_step/max_steps": "4936/6362"} +{"lm loss": 4.87020588, "grad_norm": 0.36101067, "learning_rate": 1.571e-05, "elapsed_time_per_iteration": 6.75613809, "memory(GiB)": 21.51, "elapsed_time": "9h 0m 51s", "remaining_time": "2h 36m 6s", "loss_scale": 1.0, "consumed_samples": 1263872, "global_step/max_steps": "4937/6362"} +{"lm loss": 4.86273432, "grad_norm": 0.33249015, "learning_rate": 1.569e-05, "elapsed_time_per_iteration": 6.56920314, "memory(GiB)": 21.51, "elapsed_time": "9h 0m 57s", "remaining_time": "2h 36m 0s", "loss_scale": 1.0, "consumed_samples": 1264128, "global_step/max_steps": "4938/6362"} +{"lm loss": 4.86081505, "grad_norm": 0.3409763, "learning_rate": 1.567e-05, "elapsed_time_per_iteration": 6.56082559, "memory(GiB)": 21.51, "elapsed_time": "9h 1m 4s", "remaining_time": "2h 35m 53s", "loss_scale": 1.0, "consumed_samples": 1264384, "global_step/max_steps": "4939/6362"} +{"lm loss": 4.88028336, "grad_norm": 0.32994512, "learning_rate": 1.566e-05, "elapsed_time_per_iteration": 6.55867958, "memory(GiB)": 21.51, "elapsed_time": "9h 1m 11s", "remaining_time": "2h 35m 46s", "loss_scale": 1.0, "consumed_samples": 1264640, "global_step/max_steps": "4940/6362"} +{"lm loss": 4.85147476, "grad_norm": 0.35646728, "learning_rate": 1.564e-05, "elapsed_time_per_iteration": 6.76056147, "memory(GiB)": 21.51, "elapsed_time": "9h 1m 17s", "remaining_time": "2h 35m 40s", "loss_scale": 1.0, "consumed_samples": 1264896, "global_step/max_steps": "4941/6362"} +{"lm loss": 4.88222504, "grad_norm": 0.38596022, "learning_rate": 1.562e-05, "elapsed_time_per_iteration": 6.50996304, "memory(GiB)": 21.51, "elapsed_time": "9h 1m 24s", "remaining_time": "2h 35m 33s", "loss_scale": 1.0, "consumed_samples": 1265152, "global_step/max_steps": "4942/6362"} +{"lm loss": 4.90742159, "grad_norm": 0.35876146, "learning_rate": 1.561e-05, "elapsed_time_per_iteration": 6.42226911, "memory(GiB)": 21.51, "elapsed_time": "9h 1m 30s", "remaining_time": "2h 35m 27s", "loss_scale": 1.0, "consumed_samples": 1265408, "global_step/max_steps": "4943/6362"} +{"lm loss": 4.88308144, "grad_norm": 0.36089242, "learning_rate": 1.559e-05, "elapsed_time_per_iteration": 6.61346555, "memory(GiB)": 21.51, "elapsed_time": "9h 1m 37s", "remaining_time": "2h 35m 20s", "loss_scale": 1.0, "consumed_samples": 1265664, "global_step/max_steps": "4944/6362"} +{"lm loss": 4.88186026, "grad_norm": 0.34604043, "learning_rate": 1.557e-05, "elapsed_time_per_iteration": 6.72361159, "memory(GiB)": 21.51, "elapsed_time": "9h 1m 44s", "remaining_time": "2h 35m 14s", "loss_scale": 1.0, "consumed_samples": 1265920, "global_step/max_steps": "4945/6362"} +{"lm loss": 4.85386229, "grad_norm": 0.36300549, "learning_rate": 1.555e-05, "elapsed_time_per_iteration": 6.71980429, "memory(GiB)": 21.51, "elapsed_time": "9h 1m 50s", "remaining_time": "2h 35m 7s", "loss_scale": 1.0, "consumed_samples": 1266176, "global_step/max_steps": "4946/6362"} +{"lm loss": 4.88712597, "grad_norm": 0.35524994, "learning_rate": 1.554e-05, "elapsed_time_per_iteration": 6.56605887, "memory(GiB)": 21.51, "elapsed_time": "9h 1m 57s", "remaining_time": "2h 35m 1s", "loss_scale": 1.0, "consumed_samples": 1266432, "global_step/max_steps": "4947/6362"} +{"lm loss": 4.87347555, "grad_norm": 0.36646965, "learning_rate": 1.552e-05, "elapsed_time_per_iteration": 6.4908278, "memory(GiB)": 21.51, "elapsed_time": "9h 2m 3s", "remaining_time": "2h 34m 54s", "loss_scale": 1.0, "consumed_samples": 1266688, "global_step/max_steps": "4948/6362"} +{"lm loss": 4.86275482, "grad_norm": 0.32976812, "learning_rate": 1.55e-05, "elapsed_time_per_iteration": 6.34801674, "memory(GiB)": 21.51, "elapsed_time": "9h 2m 10s", "remaining_time": "2h 34m 47s", "loss_scale": 1.0, "consumed_samples": 1266944, "global_step/max_steps": "4949/6362"} +{"lm loss": 4.88171482, "grad_norm": 0.34649095, "learning_rate": 1.549e-05, "elapsed_time_per_iteration": 6.49584937, "memory(GiB)": 21.51, "elapsed_time": "9h 2m 16s", "remaining_time": "2h 34m 41s", "loss_scale": 1.0, "consumed_samples": 1267200, "global_step/max_steps": "4950/6362"} +{"lm loss": 4.8556962, "grad_norm": 0.33612302, "learning_rate": 1.547e-05, "elapsed_time_per_iteration": 6.57798839, "memory(GiB)": 21.51, "elapsed_time": "9h 2m 23s", "remaining_time": "2h 34m 34s", "loss_scale": 1.0, "consumed_samples": 1267456, "global_step/max_steps": "4951/6362"} +{"lm loss": 4.88695812, "grad_norm": 0.31868649, "learning_rate": 1.545e-05, "elapsed_time_per_iteration": 6.47891164, "memory(GiB)": 21.51, "elapsed_time": "9h 2m 29s", "remaining_time": "2h 34m 28s", "loss_scale": 1.0, "consumed_samples": 1267712, "global_step/max_steps": "4952/6362"} +{"lm loss": 4.88844299, "grad_norm": 0.3344323, "learning_rate": 1.544e-05, "elapsed_time_per_iteration": 6.7322731, "memory(GiB)": 21.51, "elapsed_time": "9h 2m 36s", "remaining_time": "2h 34m 21s", "loss_scale": 1.0, "consumed_samples": 1267968, "global_step/max_steps": "4953/6362"} +{"lm loss": 4.85396004, "grad_norm": 0.3095299, "learning_rate": 1.542e-05, "elapsed_time_per_iteration": 6.58352184, "memory(GiB)": 21.51, "elapsed_time": "9h 2m 43s", "remaining_time": "2h 34m 14s", "loss_scale": 1.0, "consumed_samples": 1268224, "global_step/max_steps": "4954/6362"} +{"lm loss": 4.88832998, "grad_norm": 0.33657026, "learning_rate": 1.54e-05, "elapsed_time_per_iteration": 6.59725451, "memory(GiB)": 21.51, "elapsed_time": "9h 2m 49s", "remaining_time": "2h 34m 8s", "loss_scale": 1.0, "consumed_samples": 1268480, "global_step/max_steps": "4955/6362"} +{"lm loss": 4.88794565, "grad_norm": 0.33323482, "learning_rate": 1.539e-05, "elapsed_time_per_iteration": 6.46166468, "memory(GiB)": 21.51, "elapsed_time": "9h 2m 56s", "remaining_time": "2h 34m 1s", "loss_scale": 1.0, "consumed_samples": 1268736, "global_step/max_steps": "4956/6362"} +{"lm loss": 4.88093996, "grad_norm": 0.32104999, "learning_rate": 1.537e-05, "elapsed_time_per_iteration": 6.59123111, "memory(GiB)": 21.51, "elapsed_time": "9h 3m 2s", "remaining_time": "2h 33m 55s", "loss_scale": 1.0, "consumed_samples": 1268992, "global_step/max_steps": "4957/6362"} +{"lm loss": 4.85277033, "grad_norm": 0.34368894, "learning_rate": 1.535e-05, "elapsed_time_per_iteration": 6.4936657, "memory(GiB)": 21.51, "elapsed_time": "9h 3m 9s", "remaining_time": "2h 33m 48s", "loss_scale": 1.0, "consumed_samples": 1269248, "global_step/max_steps": "4958/6362"} +{"lm loss": 4.88228369, "grad_norm": 0.31336871, "learning_rate": 1.534e-05, "elapsed_time_per_iteration": 6.47874498, "memory(GiB)": 21.51, "elapsed_time": "9h 3m 15s", "remaining_time": "2h 33m 41s", "loss_scale": 1.0, "consumed_samples": 1269504, "global_step/max_steps": "4959/6362"} +{"lm loss": 4.88676882, "grad_norm": 0.35193893, "learning_rate": 1.532e-05, "elapsed_time_per_iteration": 6.56075025, "memory(GiB)": 21.51, "elapsed_time": "9h 3m 22s", "remaining_time": "2h 33m 35s", "loss_scale": 1.0, "consumed_samples": 1269760, "global_step/max_steps": "4960/6362"} +{"lm loss": 4.89318657, "grad_norm": 0.32138717, "learning_rate": 1.53e-05, "elapsed_time_per_iteration": 6.42183733, "memory(GiB)": 21.51, "elapsed_time": "9h 3m 28s", "remaining_time": "2h 33m 28s", "loss_scale": 1.0, "consumed_samples": 1270016, "global_step/max_steps": "4961/6362"} +{"lm loss": 4.88787508, "grad_norm": 0.35634914, "learning_rate": 1.529e-05, "elapsed_time_per_iteration": 6.47366261, "memory(GiB)": 21.51, "elapsed_time": "9h 3m 35s", "remaining_time": "2h 33m 22s", "loss_scale": 1.0, "consumed_samples": 1270272, "global_step/max_steps": "4962/6362"} +{"lm loss": 4.87732649, "grad_norm": 0.32556364, "learning_rate": 1.527e-05, "elapsed_time_per_iteration": 6.33940053, "memory(GiB)": 21.51, "elapsed_time": "9h 3m 41s", "remaining_time": "2h 33m 15s", "loss_scale": 1.0, "consumed_samples": 1270528, "global_step/max_steps": "4963/6362"} +{"lm loss": 4.88209677, "grad_norm": 0.33658481, "learning_rate": 1.525e-05, "elapsed_time_per_iteration": 6.34237623, "memory(GiB)": 21.51, "elapsed_time": "9h 3m 47s", "remaining_time": "2h 33m 8s", "loss_scale": 1.0, "consumed_samples": 1270784, "global_step/max_steps": "4964/6362"} +{"lm loss": 4.8989253, "grad_norm": 0.33115196, "learning_rate": 1.523e-05, "elapsed_time_per_iteration": 6.43318343, "memory(GiB)": 21.51, "elapsed_time": "9h 3m 54s", "remaining_time": "2h 33m 2s", "loss_scale": 1.0, "consumed_samples": 1271040, "global_step/max_steps": "4965/6362"} +{"lm loss": 4.89511395, "grad_norm": 0.31220922, "learning_rate": 1.522e-05, "elapsed_time_per_iteration": 6.42630863, "memory(GiB)": 21.51, "elapsed_time": "9h 4m 0s", "remaining_time": "2h 32m 55s", "loss_scale": 1.0, "consumed_samples": 1271296, "global_step/max_steps": "4966/6362"} +{"lm loss": 4.86176014, "grad_norm": 0.35489807, "learning_rate": 1.52e-05, "elapsed_time_per_iteration": 6.38141608, "memory(GiB)": 21.51, "elapsed_time": "9h 4m 7s", "remaining_time": "2h 32m 49s", "loss_scale": 1.0, "consumed_samples": 1271552, "global_step/max_steps": "4967/6362"} +{"lm loss": 4.88307095, "grad_norm": 0.31418496, "learning_rate": 1.518e-05, "elapsed_time_per_iteration": 6.37909245, "memory(GiB)": 21.51, "elapsed_time": "9h 4m 13s", "remaining_time": "2h 32m 42s", "loss_scale": 1.0, "consumed_samples": 1271808, "global_step/max_steps": "4968/6362"} +{"lm loss": 4.8649559, "grad_norm": 0.35251746, "learning_rate": 1.517e-05, "elapsed_time_per_iteration": 6.64095211, "memory(GiB)": 21.51, "elapsed_time": "9h 4m 20s", "remaining_time": "2h 32m 35s", "loss_scale": 1.0, "consumed_samples": 1272064, "global_step/max_steps": "4969/6362"} +{"lm loss": 4.87314653, "grad_norm": 0.35484087, "learning_rate": 1.515e-05, "elapsed_time_per_iteration": 6.5827353, "memory(GiB)": 21.51, "elapsed_time": "9h 4m 26s", "remaining_time": "2h 32m 29s", "loss_scale": 1.0, "consumed_samples": 1272320, "global_step/max_steps": "4970/6362"} +{"lm loss": 4.90799856, "grad_norm": 0.30798611, "learning_rate": 1.513e-05, "elapsed_time_per_iteration": 6.53363228, "memory(GiB)": 21.51, "elapsed_time": "9h 4m 33s", "remaining_time": "2h 32m 22s", "loss_scale": 1.0, "consumed_samples": 1272576, "global_step/max_steps": "4971/6362"} +{"lm loss": 4.90018845, "grad_norm": 0.32041034, "learning_rate": 1.512e-05, "elapsed_time_per_iteration": 6.5300374, "memory(GiB)": 21.51, "elapsed_time": "9h 4m 39s", "remaining_time": "2h 32m 16s", "loss_scale": 1.0, "consumed_samples": 1272832, "global_step/max_steps": "4972/6362"} +{"lm loss": 4.87049532, "grad_norm": 0.31618544, "learning_rate": 1.51e-05, "elapsed_time_per_iteration": 6.52236962, "memory(GiB)": 21.51, "elapsed_time": "9h 4m 46s", "remaining_time": "2h 32m 9s", "loss_scale": 1.0, "consumed_samples": 1273088, "global_step/max_steps": "4973/6362"} +{"lm loss": 4.86504936, "grad_norm": 0.32278845, "learning_rate": 1.508e-05, "elapsed_time_per_iteration": 6.59636831, "memory(GiB)": 21.51, "elapsed_time": "9h 4m 52s", "remaining_time": "2h 32m 2s", "loss_scale": 1.0, "consumed_samples": 1273344, "global_step/max_steps": "4974/6362"} +{"lm loss": 4.88830519, "grad_norm": 0.32132077, "learning_rate": 1.507e-05, "elapsed_time_per_iteration": 6.62174463, "memory(GiB)": 21.51, "elapsed_time": "9h 4m 59s", "remaining_time": "2h 31m 56s", "loss_scale": 1.0, "consumed_samples": 1273600, "global_step/max_steps": "4975/6362"} +{"lm loss": 4.8814888, "grad_norm": 0.32150349, "learning_rate": 1.505e-05, "elapsed_time_per_iteration": 6.6337769, "memory(GiB)": 21.51, "elapsed_time": "9h 5m 6s", "remaining_time": "2h 31m 49s", "loss_scale": 1.0, "consumed_samples": 1273856, "global_step/max_steps": "4976/6362"} +{"lm loss": 4.88052511, "grad_norm": 0.32983154, "learning_rate": 1.503e-05, "elapsed_time_per_iteration": 6.52527046, "memory(GiB)": 21.51, "elapsed_time": "9h 5m 12s", "remaining_time": "2h 31m 43s", "loss_scale": 1.0, "consumed_samples": 1274112, "global_step/max_steps": "4977/6362"} +{"lm loss": 4.88357306, "grad_norm": 0.31171986, "learning_rate": 1.502e-05, "elapsed_time_per_iteration": 6.43345189, "memory(GiB)": 21.51, "elapsed_time": "9h 5m 19s", "remaining_time": "2h 31m 36s", "loss_scale": 1.0, "consumed_samples": 1274368, "global_step/max_steps": "4978/6362"} +{"lm loss": 4.90279293, "grad_norm": 0.32719311, "learning_rate": 1.5e-05, "elapsed_time_per_iteration": 6.53926873, "memory(GiB)": 21.51, "elapsed_time": "9h 5m 25s", "remaining_time": "2h 31m 30s", "loss_scale": 1.0, "consumed_samples": 1274624, "global_step/max_steps": "4979/6362"} +{"lm loss": 4.8720293, "grad_norm": 0.33544439, "learning_rate": 1.498e-05, "elapsed_time_per_iteration": 6.57664084, "memory(GiB)": 21.51, "elapsed_time": "9h 5m 32s", "remaining_time": "2h 31m 23s", "loss_scale": 1.0, "consumed_samples": 1274880, "global_step/max_steps": "4980/6362"} +{"lm loss": 4.87549353, "grad_norm": 0.3379989, "learning_rate": 1.497e-05, "elapsed_time_per_iteration": 6.5492537, "memory(GiB)": 21.51, "elapsed_time": "9h 5m 38s", "remaining_time": "2h 31m 16s", "loss_scale": 1.0, "consumed_samples": 1275136, "global_step/max_steps": "4981/6362"} +{"lm loss": 4.88166618, "grad_norm": 0.33963028, "learning_rate": 1.495e-05, "elapsed_time_per_iteration": 6.43365598, "memory(GiB)": 21.51, "elapsed_time": "9h 5m 45s", "remaining_time": "2h 31m 10s", "loss_scale": 1.0, "consumed_samples": 1275392, "global_step/max_steps": "4982/6362"} +{"lm loss": 4.88416052, "grad_norm": 0.32286823, "learning_rate": 1.494e-05, "elapsed_time_per_iteration": 6.36295033, "memory(GiB)": 21.51, "elapsed_time": "9h 5m 51s", "remaining_time": "2h 31m 3s", "loss_scale": 1.0, "consumed_samples": 1275648, "global_step/max_steps": "4983/6362"} +{"lm loss": 4.88383579, "grad_norm": 0.34418672, "learning_rate": 1.492e-05, "elapsed_time_per_iteration": 6.24943614, "memory(GiB)": 21.51, "elapsed_time": "9h 5m 57s", "remaining_time": "2h 30m 57s", "loss_scale": 1.0, "consumed_samples": 1275904, "global_step/max_steps": "4984/6362"} +{"lm loss": 4.88647652, "grad_norm": 0.32003251, "learning_rate": 1.49e-05, "elapsed_time_per_iteration": 6.67282629, "memory(GiB)": 21.51, "elapsed_time": "9h 6m 4s", "remaining_time": "2h 30m 50s", "loss_scale": 1.0, "consumed_samples": 1276160, "global_step/max_steps": "4985/6362"} +{"lm loss": 4.8862977, "grad_norm": 0.32250088, "learning_rate": 1.489e-05, "elapsed_time_per_iteration": 6.50619268, "memory(GiB)": 21.51, "elapsed_time": "9h 6m 10s", "remaining_time": "2h 30m 43s", "loss_scale": 1.0, "consumed_samples": 1276416, "global_step/max_steps": "4986/6362"} +{"lm loss": 4.90915251, "grad_norm": 0.34551027, "learning_rate": 1.487e-05, "elapsed_time_per_iteration": 6.41968226, "memory(GiB)": 21.51, "elapsed_time": "9h 6m 17s", "remaining_time": "2h 30m 37s", "loss_scale": 1.0, "consumed_samples": 1276672, "global_step/max_steps": "4987/6362"} +{"lm loss": 4.89369059, "grad_norm": 0.33898774, "learning_rate": 1.485e-05, "elapsed_time_per_iteration": 6.43973875, "memory(GiB)": 21.51, "elapsed_time": "9h 6m 23s", "remaining_time": "2h 30m 30s", "loss_scale": 1.0, "consumed_samples": 1276928, "global_step/max_steps": "4988/6362"} +{"lm loss": 4.86881018, "grad_norm": 0.34354496, "learning_rate": 1.484e-05, "elapsed_time_per_iteration": 6.63085699, "memory(GiB)": 21.51, "elapsed_time": "9h 6m 30s", "remaining_time": "2h 30m 24s", "loss_scale": 1.0, "consumed_samples": 1277184, "global_step/max_steps": "4989/6362"} +{"lm loss": 4.86998606, "grad_norm": 0.34076011, "learning_rate": 1.482e-05, "elapsed_time_per_iteration": 6.62312341, "memory(GiB)": 21.51, "elapsed_time": "9h 6m 37s", "remaining_time": "2h 30m 17s", "loss_scale": 1.0, "consumed_samples": 1277440, "global_step/max_steps": "4990/6362"} +{"lm loss": 4.89403296, "grad_norm": 0.36715466, "learning_rate": 1.48e-05, "elapsed_time_per_iteration": 6.63074589, "memory(GiB)": 21.51, "elapsed_time": "9h 6m 43s", "remaining_time": "2h 30m 10s", "loss_scale": 1.0, "consumed_samples": 1277696, "global_step/max_steps": "4991/6362"} +{"lm loss": 4.86225319, "grad_norm": 0.32009208, "learning_rate": 1.479e-05, "elapsed_time_per_iteration": 6.49349523, "memory(GiB)": 21.51, "elapsed_time": "9h 6m 50s", "remaining_time": "2h 30m 4s", "loss_scale": 1.0, "consumed_samples": 1277952, "global_step/max_steps": "4992/6362"} +{"lm loss": 4.90285778, "grad_norm": 0.34046265, "learning_rate": 1.477e-05, "elapsed_time_per_iteration": 6.83373404, "memory(GiB)": 21.51, "elapsed_time": "9h 6m 57s", "remaining_time": "2h 29m 57s", "loss_scale": 1.0, "consumed_samples": 1278208, "global_step/max_steps": "4993/6362"} +{"lm loss": 4.88222742, "grad_norm": 0.34247386, "learning_rate": 1.475e-05, "elapsed_time_per_iteration": 6.64723945, "memory(GiB)": 21.51, "elapsed_time": "9h 7m 3s", "remaining_time": "2h 29m 51s", "loss_scale": 1.0, "consumed_samples": 1278464, "global_step/max_steps": "4994/6362"} +{"lm loss": 4.87113047, "grad_norm": 0.35458553, "learning_rate": 1.474e-05, "elapsed_time_per_iteration": 6.55719638, "memory(GiB)": 21.51, "elapsed_time": "9h 7m 10s", "remaining_time": "2h 29m 44s", "loss_scale": 1.0, "consumed_samples": 1278720, "global_step/max_steps": "4995/6362"} +{"lm loss": 4.87435961, "grad_norm": 0.35406724, "learning_rate": 1.472e-05, "elapsed_time_per_iteration": 6.5668931, "memory(GiB)": 21.51, "elapsed_time": "9h 7m 16s", "remaining_time": "2h 29m 38s", "loss_scale": 1.0, "consumed_samples": 1278976, "global_step/max_steps": "4996/6362"} +{"lm loss": 4.86637402, "grad_norm": 0.343685, "learning_rate": 1.47e-05, "elapsed_time_per_iteration": 6.71971297, "memory(GiB)": 21.51, "elapsed_time": "9h 7m 23s", "remaining_time": "2h 29m 31s", "loss_scale": 1.0, "consumed_samples": 1279232, "global_step/max_steps": "4997/6362"} +{"lm loss": 4.85913134, "grad_norm": 0.35286546, "learning_rate": 1.469e-05, "elapsed_time_per_iteration": 6.61578751, "memory(GiB)": 21.51, "elapsed_time": "9h 7m 30s", "remaining_time": "2h 29m 25s", "loss_scale": 1.0, "consumed_samples": 1279488, "global_step/max_steps": "4998/6362"} +{"lm loss": 4.84644985, "grad_norm": 0.34979632, "learning_rate": 1.467e-05, "elapsed_time_per_iteration": 6.4853096, "memory(GiB)": 21.51, "elapsed_time": "9h 7m 36s", "remaining_time": "2h 29m 18s", "loss_scale": 1.0, "consumed_samples": 1279744, "global_step/max_steps": "4999/6362"} +{"lm loss": 4.87473822, "grad_norm": 0.33575076, "learning_rate": 1.466e-05, "elapsed_time_per_iteration": 6.58573818, "memory(GiB)": 21.51, "elapsed_time": "9h 7m 43s", "remaining_time": "2h 29m 11s", "loss_scale": 1.0, "consumed_samples": 1280000, "global_step/max_steps": "5000/6362"} +{"lm loss": 4.87334061, "grad_norm": 0.34466863, "learning_rate": 1.464e-05, "elapsed_time_per_iteration": 6.5109241, "memory(GiB)": 21.51, "elapsed_time": "9h 7m 49s", "remaining_time": "2h 29m 5s", "loss_scale": 1.0, "consumed_samples": 1280256, "global_step/max_steps": "5001/6362"} +{"lm loss": 4.86324883, "grad_norm": 0.33970341, "learning_rate": 1.462e-05, "elapsed_time_per_iteration": 6.70392489, "memory(GiB)": 21.51, "elapsed_time": "9h 7m 56s", "remaining_time": "2h 28m 58s", "loss_scale": 1.0, "consumed_samples": 1280512, "global_step/max_steps": "5002/6362"} +{"lm loss": 4.87558174, "grad_norm": 0.33508044, "learning_rate": 1.461e-05, "elapsed_time_per_iteration": 6.56397438, "memory(GiB)": 21.51, "elapsed_time": "9h 8m 2s", "remaining_time": "2h 28m 52s", "loss_scale": 1.0, "consumed_samples": 1280768, "global_step/max_steps": "5003/6362"} +{"lm loss": 4.86149406, "grad_norm": 0.35057414, "learning_rate": 1.459e-05, "elapsed_time_per_iteration": 6.67925978, "memory(GiB)": 21.51, "elapsed_time": "9h 8m 9s", "remaining_time": "2h 28m 45s", "loss_scale": 1.0, "consumed_samples": 1281024, "global_step/max_steps": "5004/6362"} +{"lm loss": 4.88663912, "grad_norm": 0.33394393, "learning_rate": 1.457e-05, "elapsed_time_per_iteration": 6.72873592, "memory(GiB)": 21.51, "elapsed_time": "9h 8m 16s", "remaining_time": "2h 28m 39s", "loss_scale": 1.0, "consumed_samples": 1281280, "global_step/max_steps": "5005/6362"} +{"lm loss": 4.87463093, "grad_norm": 0.333891, "learning_rate": 1.456e-05, "elapsed_time_per_iteration": 6.69148946, "memory(GiB)": 21.51, "elapsed_time": "9h 8m 23s", "remaining_time": "2h 28m 32s", "loss_scale": 1.0, "consumed_samples": 1281536, "global_step/max_steps": "5006/6362"} +{"lm loss": 4.85528708, "grad_norm": 0.36489022, "learning_rate": 1.454e-05, "elapsed_time_per_iteration": 6.67097211, "memory(GiB)": 21.51, "elapsed_time": "9h 8m 29s", "remaining_time": "2h 28m 26s", "loss_scale": 1.0, "consumed_samples": 1281792, "global_step/max_steps": "5007/6362"} +{"lm loss": 4.85387707, "grad_norm": 0.32911134, "learning_rate": 1.452e-05, "elapsed_time_per_iteration": 6.62322593, "memory(GiB)": 21.51, "elapsed_time": "9h 8m 36s", "remaining_time": "2h 28m 19s", "loss_scale": 1.0, "consumed_samples": 1282048, "global_step/max_steps": "5008/6362"} +{"lm loss": 4.87306023, "grad_norm": 0.3143619, "learning_rate": 1.451e-05, "elapsed_time_per_iteration": 6.74788833, "memory(GiB)": 21.51, "elapsed_time": "9h 8m 43s", "remaining_time": "2h 28m 12s", "loss_scale": 1.0, "consumed_samples": 1282304, "global_step/max_steps": "5009/6362"} +{"lm loss": 4.8792882, "grad_norm": 0.33600023, "learning_rate": 1.449e-05, "elapsed_time_per_iteration": 6.54978704, "memory(GiB)": 21.51, "elapsed_time": "9h 8m 49s", "remaining_time": "2h 28m 6s", "loss_scale": 1.0, "consumed_samples": 1282560, "global_step/max_steps": "5010/6362"} +{"lm loss": 4.89856863, "grad_norm": 0.33264318, "learning_rate": 1.448e-05, "elapsed_time_per_iteration": 6.44766378, "memory(GiB)": 21.51, "elapsed_time": "9h 8m 56s", "remaining_time": "2h 27m 59s", "loss_scale": 1.0, "consumed_samples": 1282816, "global_step/max_steps": "5011/6362"} +{"lm loss": 4.89398003, "grad_norm": 0.32336292, "learning_rate": 1.446e-05, "elapsed_time_per_iteration": 6.34378099, "memory(GiB)": 21.51, "elapsed_time": "9h 9m 2s", "remaining_time": "2h 27m 53s", "loss_scale": 1.0, "consumed_samples": 1283072, "global_step/max_steps": "5012/6362"} +{"lm loss": 4.87903786, "grad_norm": 0.32019487, "learning_rate": 1.444e-05, "elapsed_time_per_iteration": 6.60729671, "memory(GiB)": 21.51, "elapsed_time": "9h 9m 9s", "remaining_time": "2h 27m 46s", "loss_scale": 1.0, "consumed_samples": 1283328, "global_step/max_steps": "5013/6362"} +{"lm loss": 4.87166119, "grad_norm": 0.31818748, "learning_rate": 1.443e-05, "elapsed_time_per_iteration": 6.56407833, "memory(GiB)": 21.51, "elapsed_time": "9h 9m 15s", "remaining_time": "2h 27m 40s", "loss_scale": 1.0, "consumed_samples": 1283584, "global_step/max_steps": "5014/6362"} +{"lm loss": 4.89657784, "grad_norm": 0.33886611, "learning_rate": 1.441e-05, "elapsed_time_per_iteration": 6.50945306, "memory(GiB)": 21.51, "elapsed_time": "9h 9m 22s", "remaining_time": "2h 27m 33s", "loss_scale": 1.0, "consumed_samples": 1283840, "global_step/max_steps": "5015/6362"} +{"lm loss": 4.84562826, "grad_norm": 0.33174339, "learning_rate": 1.439e-05, "elapsed_time_per_iteration": 6.59883952, "memory(GiB)": 21.51, "elapsed_time": "9h 9m 28s", "remaining_time": "2h 27m 26s", "loss_scale": 1.0, "consumed_samples": 1284096, "global_step/max_steps": "5016/6362"} +{"lm loss": 4.86906862, "grad_norm": 0.33378574, "learning_rate": 1.438e-05, "elapsed_time_per_iteration": 6.51657987, "memory(GiB)": 21.51, "elapsed_time": "9h 9m 35s", "remaining_time": "2h 27m 20s", "loss_scale": 1.0, "consumed_samples": 1284352, "global_step/max_steps": "5017/6362"} +{"lm loss": 4.90466785, "grad_norm": 0.34104094, "learning_rate": 1.436e-05, "elapsed_time_per_iteration": 6.3716495, "memory(GiB)": 21.51, "elapsed_time": "9h 9m 41s", "remaining_time": "2h 27m 13s", "loss_scale": 1.0, "consumed_samples": 1284608, "global_step/max_steps": "5018/6362"} +{"lm loss": 4.87100792, "grad_norm": 0.34473556, "learning_rate": 1.435e-05, "elapsed_time_per_iteration": 6.51946282, "memory(GiB)": 21.51, "elapsed_time": "9h 9m 48s", "remaining_time": "2h 27m 7s", "loss_scale": 1.0, "consumed_samples": 1284864, "global_step/max_steps": "5019/6362"} +{"lm loss": 4.85989714, "grad_norm": 0.35874501, "learning_rate": 1.433e-05, "elapsed_time_per_iteration": 6.54080343, "memory(GiB)": 21.51, "elapsed_time": "9h 9m 54s", "remaining_time": "2h 27m 0s", "loss_scale": 1.0, "consumed_samples": 1285120, "global_step/max_steps": "5020/6362"} +{"lm loss": 4.87661028, "grad_norm": 0.3122074, "learning_rate": 1.431e-05, "elapsed_time_per_iteration": 6.35064125, "memory(GiB)": 21.51, "elapsed_time": "9h 10m 1s", "remaining_time": "2h 26m 53s", "loss_scale": 1.0, "consumed_samples": 1285376, "global_step/max_steps": "5021/6362"} +{"lm loss": 4.87394333, "grad_norm": 0.32875192, "learning_rate": 1.43e-05, "elapsed_time_per_iteration": 6.34604502, "memory(GiB)": 21.51, "elapsed_time": "9h 10m 7s", "remaining_time": "2h 26m 47s", "loss_scale": 1.0, "consumed_samples": 1285632, "global_step/max_steps": "5022/6362"} +{"lm loss": 4.87112188, "grad_norm": 0.32943958, "learning_rate": 1.428e-05, "elapsed_time_per_iteration": 6.75936937, "memory(GiB)": 21.51, "elapsed_time": "9h 10m 14s", "remaining_time": "2h 26m 40s", "loss_scale": 1.0, "consumed_samples": 1285888, "global_step/max_steps": "5023/6362"} +{"lm loss": 4.84789324, "grad_norm": 0.32240626, "learning_rate": 1.426e-05, "elapsed_time_per_iteration": 6.54947519, "memory(GiB)": 21.51, "elapsed_time": "9h 10m 20s", "remaining_time": "2h 26m 34s", "loss_scale": 1.0, "consumed_samples": 1286144, "global_step/max_steps": "5024/6362"} +{"lm loss": 4.86468792, "grad_norm": 0.30715922, "learning_rate": 1.425e-05, "elapsed_time_per_iteration": 6.47311401, "memory(GiB)": 21.51, "elapsed_time": "9h 10m 27s", "remaining_time": "2h 26m 27s", "loss_scale": 1.0, "consumed_samples": 1286400, "global_step/max_steps": "5025/6362"} +{"lm loss": 4.88021231, "grad_norm": 0.33063018, "learning_rate": 1.423e-05, "elapsed_time_per_iteration": 6.53859735, "memory(GiB)": 21.51, "elapsed_time": "9h 10m 33s", "remaining_time": "2h 26m 20s", "loss_scale": 1.0, "consumed_samples": 1286656, "global_step/max_steps": "5026/6362"} +{"lm loss": 4.87086439, "grad_norm": 0.32418442, "learning_rate": 1.422e-05, "elapsed_time_per_iteration": 6.79097486, "memory(GiB)": 21.51, "elapsed_time": "9h 10m 40s", "remaining_time": "2h 26m 14s", "loss_scale": 1.0, "consumed_samples": 1286912, "global_step/max_steps": "5027/6362"} +{"lm loss": 4.89312029, "grad_norm": 0.3325246, "learning_rate": 1.42e-05, "elapsed_time_per_iteration": 6.79986405, "memory(GiB)": 21.51, "elapsed_time": "9h 10m 47s", "remaining_time": "2h 26m 7s", "loss_scale": 1.0, "consumed_samples": 1287168, "global_step/max_steps": "5028/6362"} +{"lm loss": 4.87606859, "grad_norm": 0.31579486, "learning_rate": 1.418e-05, "elapsed_time_per_iteration": 6.58048892, "memory(GiB)": 21.51, "elapsed_time": "9h 10m 53s", "remaining_time": "2h 26m 1s", "loss_scale": 1.0, "consumed_samples": 1287424, "global_step/max_steps": "5029/6362"} +{"lm loss": 4.87048483, "grad_norm": 0.31479922, "learning_rate": 1.417e-05, "elapsed_time_per_iteration": 6.54318237, "memory(GiB)": 21.51, "elapsed_time": "9h 11m 0s", "remaining_time": "2h 25m 54s", "loss_scale": 1.0, "consumed_samples": 1287680, "global_step/max_steps": "5030/6362"} +{"lm loss": 4.883389, "grad_norm": 0.32438359, "learning_rate": 1.415e-05, "elapsed_time_per_iteration": 6.43779707, "memory(GiB)": 21.51, "elapsed_time": "9h 11m 6s", "remaining_time": "2h 25m 48s", "loss_scale": 1.0, "consumed_samples": 1287936, "global_step/max_steps": "5031/6362"} +{"lm loss": 4.89178753, "grad_norm": 0.32439741, "learning_rate": 1.414e-05, "elapsed_time_per_iteration": 6.57805848, "memory(GiB)": 21.51, "elapsed_time": "9h 11m 13s", "remaining_time": "2h 25m 41s", "loss_scale": 1.0, "consumed_samples": 1288192, "global_step/max_steps": "5032/6362"} +{"lm loss": 4.89482164, "grad_norm": 0.3285594, "learning_rate": 1.412e-05, "elapsed_time_per_iteration": 6.66121602, "memory(GiB)": 21.51, "elapsed_time": "9h 11m 20s", "remaining_time": "2h 25m 35s", "loss_scale": 1.0, "consumed_samples": 1288448, "global_step/max_steps": "5033/6362"} +{"lm loss": 4.88426685, "grad_norm": 0.31638148, "learning_rate": 1.41e-05, "elapsed_time_per_iteration": 6.7980597, "memory(GiB)": 21.51, "elapsed_time": "9h 11m 26s", "remaining_time": "2h 25m 28s", "loss_scale": 1.0, "consumed_samples": 1288704, "global_step/max_steps": "5034/6362"} +{"lm loss": 4.87647581, "grad_norm": 0.32945755, "learning_rate": 1.409e-05, "elapsed_time_per_iteration": 6.376894, "memory(GiB)": 21.51, "elapsed_time": "9h 11m 33s", "remaining_time": "2h 25m 21s", "loss_scale": 1.0, "consumed_samples": 1288960, "global_step/max_steps": "5035/6362"} +{"lm loss": 4.86449862, "grad_norm": 0.32948044, "learning_rate": 1.407e-05, "elapsed_time_per_iteration": 6.7213974, "memory(GiB)": 21.51, "elapsed_time": "9h 11m 40s", "remaining_time": "2h 25m 15s", "loss_scale": 1.0, "consumed_samples": 1289216, "global_step/max_steps": "5036/6362"} +{"lm loss": 4.8862772, "grad_norm": 0.34118408, "learning_rate": 1.406e-05, "elapsed_time_per_iteration": 6.7735815, "memory(GiB)": 21.51, "elapsed_time": "9h 11m 46s", "remaining_time": "2h 25m 8s", "loss_scale": 1.0, "consumed_samples": 1289472, "global_step/max_steps": "5037/6362"} +{"lm loss": 4.86539888, "grad_norm": 0.30979425, "learning_rate": 1.404e-05, "elapsed_time_per_iteration": 6.46280384, "memory(GiB)": 21.51, "elapsed_time": "9h 11m 53s", "remaining_time": "2h 25m 2s", "loss_scale": 1.0, "consumed_samples": 1289728, "global_step/max_steps": "5038/6362"} +{"lm loss": 4.86811304, "grad_norm": 0.33156335, "learning_rate": 1.402e-05, "elapsed_time_per_iteration": 6.70140958, "memory(GiB)": 21.51, "elapsed_time": "9h 11m 59s", "remaining_time": "2h 24m 55s", "loss_scale": 1.0, "consumed_samples": 1289984, "global_step/max_steps": "5039/6362"} +{"lm loss": 4.87003994, "grad_norm": 0.33652291, "learning_rate": 1.401e-05, "elapsed_time_per_iteration": 6.72040176, "memory(GiB)": 21.51, "elapsed_time": "9h 12m 6s", "remaining_time": "2h 24m 49s", "loss_scale": 1.0, "consumed_samples": 1290240, "global_step/max_steps": "5040/6362"} +{"lm loss": 4.87556458, "grad_norm": 0.30823016, "learning_rate": 1.399e-05, "elapsed_time_per_iteration": 6.69566774, "memory(GiB)": 21.51, "elapsed_time": "9h 12m 13s", "remaining_time": "2h 24m 42s", "loss_scale": 1.0, "consumed_samples": 1290496, "global_step/max_steps": "5041/6362"} +{"lm loss": 4.89256477, "grad_norm": 0.35824254, "learning_rate": 1.398e-05, "elapsed_time_per_iteration": 6.53960514, "memory(GiB)": 21.51, "elapsed_time": "9h 12m 19s", "remaining_time": "2h 24m 36s", "loss_scale": 1.0, "consumed_samples": 1290752, "global_step/max_steps": "5042/6362"} +{"lm loss": 4.87315416, "grad_norm": 0.33656347, "learning_rate": 1.396e-05, "elapsed_time_per_iteration": 7.36147046, "memory(GiB)": 21.51, "elapsed_time": "9h 12m 27s", "remaining_time": "2h 24m 29s", "loss_scale": 1.0, "consumed_samples": 1291008, "global_step/max_steps": "5043/6362"} +{"lm loss": 4.86998796, "grad_norm": 0.34752816, "learning_rate": 1.394e-05, "elapsed_time_per_iteration": 6.54993916, "memory(GiB)": 21.51, "elapsed_time": "9h 12m 33s", "remaining_time": "2h 24m 23s", "loss_scale": 1.0, "consumed_samples": 1291264, "global_step/max_steps": "5044/6362"} +{"lm loss": 4.88058329, "grad_norm": 0.33465347, "learning_rate": 1.393e-05, "elapsed_time_per_iteration": 6.53109622, "memory(GiB)": 21.51, "elapsed_time": "9h 12m 40s", "remaining_time": "2h 24m 16s", "loss_scale": 1.0, "consumed_samples": 1291520, "global_step/max_steps": "5045/6362"} +{"lm loss": 4.87618542, "grad_norm": 0.32869753, "learning_rate": 1.391e-05, "elapsed_time_per_iteration": 6.41276026, "memory(GiB)": 21.51, "elapsed_time": "9h 12m 46s", "remaining_time": "2h 24m 9s", "loss_scale": 1.0, "consumed_samples": 1291776, "global_step/max_steps": "5046/6362"} +{"lm loss": 4.90212774, "grad_norm": 0.3527227, "learning_rate": 1.39e-05, "elapsed_time_per_iteration": 6.492028, "memory(GiB)": 21.51, "elapsed_time": "9h 12m 53s", "remaining_time": "2h 24m 3s", "loss_scale": 1.0, "consumed_samples": 1292032, "global_step/max_steps": "5047/6362"} +{"lm loss": 4.85894203, "grad_norm": 0.34342575, "learning_rate": 1.388e-05, "elapsed_time_per_iteration": 6.5553019, "memory(GiB)": 21.51, "elapsed_time": "9h 12m 59s", "remaining_time": "2h 23m 56s", "loss_scale": 1.0, "consumed_samples": 1292288, "global_step/max_steps": "5048/6362"} +{"lm loss": 4.88286638, "grad_norm": 0.33457029, "learning_rate": 1.386e-05, "elapsed_time_per_iteration": 6.6991415, "memory(GiB)": 21.51, "elapsed_time": "9h 13m 6s", "remaining_time": "2h 23m 50s", "loss_scale": 1.0, "consumed_samples": 1292544, "global_step/max_steps": "5049/6362"} +{"lm loss": 4.87325144, "grad_norm": 0.32237938, "learning_rate": 1.385e-05, "elapsed_time_per_iteration": 6.52237201, "memory(GiB)": 21.51, "elapsed_time": "9h 13m 13s", "remaining_time": "2h 23m 43s", "loss_scale": 1.0, "consumed_samples": 1292800, "global_step/max_steps": "5050/6362"} +{"lm loss": 4.85248041, "grad_norm": 0.34251329, "learning_rate": 1.383e-05, "elapsed_time_per_iteration": 6.61676717, "memory(GiB)": 21.51, "elapsed_time": "9h 13m 19s", "remaining_time": "2h 23m 37s", "loss_scale": 1.0, "consumed_samples": 1293056, "global_step/max_steps": "5051/6362"} +{"lm loss": 4.87039852, "grad_norm": 0.33479118, "learning_rate": 1.382e-05, "elapsed_time_per_iteration": 6.77035046, "memory(GiB)": 21.51, "elapsed_time": "9h 13m 26s", "remaining_time": "2h 23m 30s", "loss_scale": 1.0, "consumed_samples": 1293312, "global_step/max_steps": "5052/6362"} +{"lm loss": 4.88587284, "grad_norm": 0.36865541, "learning_rate": 1.38e-05, "elapsed_time_per_iteration": 6.55295491, "memory(GiB)": 21.51, "elapsed_time": "9h 13m 32s", "remaining_time": "2h 23m 23s", "loss_scale": 1.0, "consumed_samples": 1293568, "global_step/max_steps": "5053/6362"} +{"lm loss": 4.86919165, "grad_norm": 0.33167943, "learning_rate": 1.378e-05, "elapsed_time_per_iteration": 6.64635301, "memory(GiB)": 21.51, "elapsed_time": "9h 13m 39s", "remaining_time": "2h 23m 17s", "loss_scale": 1.0, "consumed_samples": 1293824, "global_step/max_steps": "5054/6362"} +{"lm loss": 4.89378262, "grad_norm": 0.36697203, "learning_rate": 1.377e-05, "elapsed_time_per_iteration": 6.65642023, "memory(GiB)": 21.51, "elapsed_time": "9h 13m 46s", "remaining_time": "2h 23m 10s", "loss_scale": 1.0, "consumed_samples": 1294080, "global_step/max_steps": "5055/6362"} +{"lm loss": 4.87565041, "grad_norm": 0.30867699, "learning_rate": 1.375e-05, "elapsed_time_per_iteration": 6.33027577, "memory(GiB)": 21.51, "elapsed_time": "9h 13m 52s", "remaining_time": "2h 23m 4s", "loss_scale": 1.0, "consumed_samples": 1294336, "global_step/max_steps": "5056/6362"} +{"lm loss": 4.8952961, "grad_norm": 0.34639117, "learning_rate": 1.374e-05, "elapsed_time_per_iteration": 6.50328302, "memory(GiB)": 21.51, "elapsed_time": "9h 13m 59s", "remaining_time": "2h 22m 57s", "loss_scale": 1.0, "consumed_samples": 1294592, "global_step/max_steps": "5057/6362"} +{"lm loss": 4.87442684, "grad_norm": 0.31595528, "learning_rate": 1.372e-05, "elapsed_time_per_iteration": 6.46393275, "memory(GiB)": 21.51, "elapsed_time": "9h 14m 5s", "remaining_time": "2h 22m 51s", "loss_scale": 1.0, "consumed_samples": 1294848, "global_step/max_steps": "5058/6362"} +{"lm loss": 4.8853941, "grad_norm": 0.32827523, "learning_rate": 1.371e-05, "elapsed_time_per_iteration": 6.44558406, "memory(GiB)": 21.51, "elapsed_time": "9h 14m 12s", "remaining_time": "2h 22m 44s", "loss_scale": 1.0, "consumed_samples": 1295104, "global_step/max_steps": "5059/6362"} +{"lm loss": 4.88794756, "grad_norm": 0.34892544, "learning_rate": 1.369e-05, "elapsed_time_per_iteration": 6.4106288, "memory(GiB)": 21.51, "elapsed_time": "9h 14m 18s", "remaining_time": "2h 22m 37s", "loss_scale": 1.0, "consumed_samples": 1295360, "global_step/max_steps": "5060/6362"} +{"lm loss": 4.87687778, "grad_norm": 0.34135199, "learning_rate": 1.367e-05, "elapsed_time_per_iteration": 6.62078881, "memory(GiB)": 21.51, "elapsed_time": "9h 14m 25s", "remaining_time": "2h 22m 31s", "loss_scale": 1.0, "consumed_samples": 1295616, "global_step/max_steps": "5061/6362"} +{"lm loss": 4.87362957, "grad_norm": 0.34312403, "learning_rate": 1.366e-05, "elapsed_time_per_iteration": 6.40295148, "memory(GiB)": 21.51, "elapsed_time": "9h 14m 31s", "remaining_time": "2h 22m 24s", "loss_scale": 1.0, "consumed_samples": 1295872, "global_step/max_steps": "5062/6362"} +{"lm loss": 4.88032055, "grad_norm": 0.34724492, "learning_rate": 1.364e-05, "elapsed_time_per_iteration": 6.66082573, "memory(GiB)": 21.51, "elapsed_time": "9h 14m 38s", "remaining_time": "2h 22m 18s", "loss_scale": 1.0, "consumed_samples": 1296128, "global_step/max_steps": "5063/6362"} +{"lm loss": 4.87014723, "grad_norm": 0.32552108, "learning_rate": 1.363e-05, "elapsed_time_per_iteration": 6.42292023, "memory(GiB)": 21.51, "elapsed_time": "9h 14m 44s", "remaining_time": "2h 22m 11s", "loss_scale": 1.0, "consumed_samples": 1296384, "global_step/max_steps": "5064/6362"} +{"lm loss": 4.88307953, "grad_norm": 0.3412948, "learning_rate": 1.361e-05, "elapsed_time_per_iteration": 6.68514848, "memory(GiB)": 21.51, "elapsed_time": "9h 14m 51s", "remaining_time": "2h 22m 4s", "loss_scale": 1.0, "consumed_samples": 1296640, "global_step/max_steps": "5065/6362"} +{"lm loss": 4.87506533, "grad_norm": 0.32017058, "learning_rate": 1.359e-05, "elapsed_time_per_iteration": 6.56030607, "memory(GiB)": 21.51, "elapsed_time": "9h 14m 57s", "remaining_time": "2h 21m 58s", "loss_scale": 1.0, "consumed_samples": 1296896, "global_step/max_steps": "5066/6362"} +{"lm loss": 4.86955786, "grad_norm": 0.35419968, "learning_rate": 1.358e-05, "elapsed_time_per_iteration": 6.6269238, "memory(GiB)": 21.51, "elapsed_time": "9h 15m 4s", "remaining_time": "2h 21m 51s", "loss_scale": 1.0, "consumed_samples": 1297152, "global_step/max_steps": "5067/6362"} +{"lm loss": 4.86287355, "grad_norm": 0.32742608, "learning_rate": 1.356e-05, "elapsed_time_per_iteration": 6.38442397, "memory(GiB)": 21.51, "elapsed_time": "9h 15m 10s", "remaining_time": "2h 21m 45s", "loss_scale": 1.0, "consumed_samples": 1297408, "global_step/max_steps": "5068/6362"} +{"lm loss": 4.86189938, "grad_norm": 0.34658247, "learning_rate": 1.355e-05, "elapsed_time_per_iteration": 6.48192692, "memory(GiB)": 21.51, "elapsed_time": "9h 15m 17s", "remaining_time": "2h 21m 38s", "loss_scale": 1.0, "consumed_samples": 1297664, "global_step/max_steps": "5069/6362"} +{"lm loss": 4.87759876, "grad_norm": 0.33859921, "learning_rate": 1.353e-05, "elapsed_time_per_iteration": 6.75845361, "memory(GiB)": 21.51, "elapsed_time": "9h 15m 24s", "remaining_time": "2h 21m 32s", "loss_scale": 1.0, "consumed_samples": 1297920, "global_step/max_steps": "5070/6362"} +{"lm loss": 4.88738251, "grad_norm": 0.34328976, "learning_rate": 1.352e-05, "elapsed_time_per_iteration": 6.61792731, "memory(GiB)": 21.51, "elapsed_time": "9h 15m 30s", "remaining_time": "2h 21m 25s", "loss_scale": 1.0, "consumed_samples": 1298176, "global_step/max_steps": "5071/6362"} +{"lm loss": 4.85643435, "grad_norm": 0.33448881, "learning_rate": 1.35e-05, "elapsed_time_per_iteration": 6.6068747, "memory(GiB)": 21.51, "elapsed_time": "9h 15m 37s", "remaining_time": "2h 21m 18s", "loss_scale": 1.0, "consumed_samples": 1298432, "global_step/max_steps": "5072/6362"} +{"lm loss": 4.88479519, "grad_norm": 0.31537047, "learning_rate": 1.349e-05, "elapsed_time_per_iteration": 6.79161, "memory(GiB)": 21.51, "elapsed_time": "9h 15m 44s", "remaining_time": "2h 21m 12s", "loss_scale": 1.0, "consumed_samples": 1298688, "global_step/max_steps": "5073/6362"} +{"lm loss": 4.86916542, "grad_norm": 0.32674584, "learning_rate": 1.347e-05, "elapsed_time_per_iteration": 6.53050971, "memory(GiB)": 21.51, "elapsed_time": "9h 15m 50s", "remaining_time": "2h 21m 5s", "loss_scale": 1.0, "consumed_samples": 1298944, "global_step/max_steps": "5074/6362"} +{"lm loss": 4.87113857, "grad_norm": 0.31543407, "learning_rate": 1.345e-05, "elapsed_time_per_iteration": 6.47068524, "memory(GiB)": 21.51, "elapsed_time": "9h 15m 57s", "remaining_time": "2h 20m 59s", "loss_scale": 1.0, "consumed_samples": 1299200, "global_step/max_steps": "5075/6362"} +{"lm loss": 4.89601183, "grad_norm": 0.31531459, "learning_rate": 1.344e-05, "elapsed_time_per_iteration": 6.5742588, "memory(GiB)": 21.51, "elapsed_time": "9h 16m 3s", "remaining_time": "2h 20m 52s", "loss_scale": 1.0, "consumed_samples": 1299456, "global_step/max_steps": "5076/6362"} +{"lm loss": 4.886724, "grad_norm": 0.32194147, "learning_rate": 1.342e-05, "elapsed_time_per_iteration": 6.29664564, "memory(GiB)": 21.51, "elapsed_time": "9h 16m 9s", "remaining_time": "2h 20m 45s", "loss_scale": 1.0, "consumed_samples": 1299712, "global_step/max_steps": "5077/6362"} +{"lm loss": 4.87903547, "grad_norm": 0.30222926, "learning_rate": 1.341e-05, "elapsed_time_per_iteration": 6.71036315, "memory(GiB)": 21.51, "elapsed_time": "9h 16m 16s", "remaining_time": "2h 20m 39s", "loss_scale": 1.0, "consumed_samples": 1299968, "global_step/max_steps": "5078/6362"} +{"lm loss": 4.89059639, "grad_norm": 0.31308478, "learning_rate": 1.339e-05, "elapsed_time_per_iteration": 6.70691228, "memory(GiB)": 21.51, "elapsed_time": "9h 16m 23s", "remaining_time": "2h 20m 32s", "loss_scale": 1.0, "consumed_samples": 1300224, "global_step/max_steps": "5079/6362"} +{"lm loss": 4.88639259, "grad_norm": 0.29759592, "learning_rate": 1.338e-05, "elapsed_time_per_iteration": 6.6828208, "memory(GiB)": 21.51, "elapsed_time": "9h 16m 30s", "remaining_time": "2h 20m 26s", "loss_scale": 1.0, "consumed_samples": 1300480, "global_step/max_steps": "5080/6362"} +{"lm loss": 4.89995575, "grad_norm": 0.30320165, "learning_rate": 1.336e-05, "elapsed_time_per_iteration": 6.64269948, "memory(GiB)": 21.51, "elapsed_time": "9h 16m 36s", "remaining_time": "2h 20m 19s", "loss_scale": 1.0, "consumed_samples": 1300736, "global_step/max_steps": "5081/6362"} +{"lm loss": 4.86167145, "grad_norm": 0.31481299, "learning_rate": 1.334e-05, "elapsed_time_per_iteration": 6.63915944, "memory(GiB)": 21.51, "elapsed_time": "9h 16m 43s", "remaining_time": "2h 20m 13s", "loss_scale": 1.0, "consumed_samples": 1300992, "global_step/max_steps": "5082/6362"} +{"lm loss": 4.88567495, "grad_norm": 0.30329812, "learning_rate": 1.333e-05, "elapsed_time_per_iteration": 6.46150541, "memory(GiB)": 21.51, "elapsed_time": "9h 16m 49s", "remaining_time": "2h 20m 6s", "loss_scale": 1.0, "consumed_samples": 1301248, "global_step/max_steps": "5083/6362"} +{"lm loss": 4.88658762, "grad_norm": 0.30801356, "learning_rate": 1.331e-05, "elapsed_time_per_iteration": 6.61589837, "memory(GiB)": 21.51, "elapsed_time": "9h 16m 56s", "remaining_time": "2h 20m 0s", "loss_scale": 1.0, "consumed_samples": 1301504, "global_step/max_steps": "5084/6362"} +{"lm loss": 4.88653564, "grad_norm": 0.32360351, "learning_rate": 1.33e-05, "elapsed_time_per_iteration": 6.70599127, "memory(GiB)": 21.51, "elapsed_time": "9h 17m 3s", "remaining_time": "2h 19m 53s", "loss_scale": 1.0, "consumed_samples": 1301760, "global_step/max_steps": "5085/6362"} +{"lm loss": 4.84296942, "grad_norm": 0.32652664, "learning_rate": 1.328e-05, "elapsed_time_per_iteration": 6.54739475, "memory(GiB)": 21.51, "elapsed_time": "9h 17m 9s", "remaining_time": "2h 19m 46s", "loss_scale": 1.0, "consumed_samples": 1302016, "global_step/max_steps": "5086/6362"} +{"lm loss": 4.8704524, "grad_norm": 0.35728407, "learning_rate": 1.327e-05, "elapsed_time_per_iteration": 6.50942707, "memory(GiB)": 21.51, "elapsed_time": "9h 17m 16s", "remaining_time": "2h 19m 40s", "loss_scale": 1.0, "consumed_samples": 1302272, "global_step/max_steps": "5087/6362"} +{"lm loss": 4.86459208, "grad_norm": 0.34160089, "learning_rate": 1.325e-05, "elapsed_time_per_iteration": 6.42303443, "memory(GiB)": 21.51, "elapsed_time": "9h 17m 22s", "remaining_time": "2h 19m 33s", "loss_scale": 1.0, "consumed_samples": 1302528, "global_step/max_steps": "5088/6362"} +{"lm loss": 4.88860226, "grad_norm": 0.32323286, "learning_rate": 1.324e-05, "elapsed_time_per_iteration": 6.48940253, "memory(GiB)": 21.51, "elapsed_time": "9h 17m 29s", "remaining_time": "2h 19m 27s", "loss_scale": 1.0, "consumed_samples": 1302784, "global_step/max_steps": "5089/6362"} +{"lm loss": 4.90135336, "grad_norm": 0.33819777, "learning_rate": 1.322e-05, "elapsed_time_per_iteration": 6.87870216, "memory(GiB)": 21.51, "elapsed_time": "9h 17m 35s", "remaining_time": "2h 19m 20s", "loss_scale": 1.0, "consumed_samples": 1303040, "global_step/max_steps": "5090/6362"} +{"lm loss": 4.88214684, "grad_norm": 0.34404656, "learning_rate": 1.321e-05, "elapsed_time_per_iteration": 6.50658226, "memory(GiB)": 21.51, "elapsed_time": "9h 17m 42s", "remaining_time": "2h 19m 14s", "loss_scale": 1.0, "consumed_samples": 1303296, "global_step/max_steps": "5091/6362"} +{"lm loss": 4.89140558, "grad_norm": 0.33205378, "learning_rate": 1.319e-05, "elapsed_time_per_iteration": 6.53090572, "memory(GiB)": 21.51, "elapsed_time": "9h 17m 48s", "remaining_time": "2h 19m 7s", "loss_scale": 1.0, "consumed_samples": 1303552, "global_step/max_steps": "5092/6362"} +{"lm loss": 4.88771868, "grad_norm": 0.32782468, "learning_rate": 1.317e-05, "elapsed_time_per_iteration": 7.10042453, "memory(GiB)": 21.51, "elapsed_time": "9h 17m 56s", "remaining_time": "2h 19m 1s", "loss_scale": 1.0, "consumed_samples": 1303808, "global_step/max_steps": "5093/6362"} +{"lm loss": 4.86278629, "grad_norm": 0.33333907, "learning_rate": 1.316e-05, "elapsed_time_per_iteration": 6.61259556, "memory(GiB)": 21.51, "elapsed_time": "9h 18m 2s", "remaining_time": "2h 18m 54s", "loss_scale": 1.0, "consumed_samples": 1304064, "global_step/max_steps": "5094/6362"} +{"lm loss": 4.8695879, "grad_norm": 0.31848183, "learning_rate": 1.314e-05, "elapsed_time_per_iteration": 6.40919662, "memory(GiB)": 21.51, "elapsed_time": "9h 18m 9s", "remaining_time": "2h 18m 47s", "loss_scale": 1.0, "consumed_samples": 1304320, "global_step/max_steps": "5095/6362"} +{"lm loss": 4.86304379, "grad_norm": 0.31735888, "learning_rate": 1.313e-05, "elapsed_time_per_iteration": 6.44580507, "memory(GiB)": 21.51, "elapsed_time": "9h 18m 15s", "remaining_time": "2h 18m 41s", "loss_scale": 1.0, "consumed_samples": 1304576, "global_step/max_steps": "5096/6362"} +{"lm loss": 4.87405777, "grad_norm": 0.33630034, "learning_rate": 1.311e-05, "elapsed_time_per_iteration": 6.65887904, "memory(GiB)": 21.51, "elapsed_time": "9h 18m 22s", "remaining_time": "2h 18m 34s", "loss_scale": 1.0, "consumed_samples": 1304832, "global_step/max_steps": "5097/6362"} +{"lm loss": 4.85838509, "grad_norm": 0.29077035, "learning_rate": 1.31e-05, "elapsed_time_per_iteration": 6.6046226, "memory(GiB)": 21.51, "elapsed_time": "9h 18m 28s", "remaining_time": "2h 18m 28s", "loss_scale": 1.0, "consumed_samples": 1305088, "global_step/max_steps": "5098/6362"} +{"lm loss": 4.89815283, "grad_norm": 0.33051854, "learning_rate": 1.308e-05, "elapsed_time_per_iteration": 6.4853971, "memory(GiB)": 21.51, "elapsed_time": "9h 18m 35s", "remaining_time": "2h 18m 21s", "loss_scale": 1.0, "consumed_samples": 1305344, "global_step/max_steps": "5099/6362"} +{"lm loss": 4.8637166, "grad_norm": 0.30024797, "learning_rate": 1.307e-05, "elapsed_time_per_iteration": 6.85844231, "memory(GiB)": 21.51, "elapsed_time": "9h 18m 42s", "remaining_time": "2h 18m 15s", "loss_scale": 1.0, "consumed_samples": 1305600, "global_step/max_steps": "5100/6362"} +{"lm loss": 4.88102245, "grad_norm": 0.3221612, "learning_rate": 1.305e-05, "elapsed_time_per_iteration": 6.64939547, "memory(GiB)": 21.51, "elapsed_time": "9h 18m 48s", "remaining_time": "2h 18m 8s", "loss_scale": 1.0, "consumed_samples": 1305856, "global_step/max_steps": "5101/6362"} +{"lm loss": 4.86233091, "grad_norm": 0.29275626, "learning_rate": 1.304e-05, "elapsed_time_per_iteration": 6.59655809, "memory(GiB)": 21.51, "elapsed_time": "9h 18m 55s", "remaining_time": "2h 18m 1s", "loss_scale": 1.0, "consumed_samples": 1306112, "global_step/max_steps": "5102/6362"} +{"lm loss": 4.87000704, "grad_norm": 0.315714, "learning_rate": 1.302e-05, "elapsed_time_per_iteration": 6.56756997, "memory(GiB)": 21.51, "elapsed_time": "9h 19m 1s", "remaining_time": "2h 17m 55s", "loss_scale": 1.0, "consumed_samples": 1306368, "global_step/max_steps": "5103/6362"} +{"lm loss": 4.87634754, "grad_norm": 0.30649364, "learning_rate": 1.3e-05, "elapsed_time_per_iteration": 6.54350495, "memory(GiB)": 21.51, "elapsed_time": "9h 19m 8s", "remaining_time": "2h 17m 48s", "loss_scale": 1.0, "consumed_samples": 1306624, "global_step/max_steps": "5104/6362"} +{"lm loss": 4.88543606, "grad_norm": 0.32608417, "learning_rate": 1.299e-05, "elapsed_time_per_iteration": 6.60119677, "memory(GiB)": 21.51, "elapsed_time": "9h 19m 15s", "remaining_time": "2h 17m 42s", "loss_scale": 1.0, "consumed_samples": 1306880, "global_step/max_steps": "5105/6362"} +{"lm loss": 4.85694647, "grad_norm": 0.29442507, "learning_rate": 1.297e-05, "elapsed_time_per_iteration": 6.57843232, "memory(GiB)": 21.51, "elapsed_time": "9h 19m 21s", "remaining_time": "2h 17m 35s", "loss_scale": 1.0, "consumed_samples": 1307136, "global_step/max_steps": "5106/6362"} +{"lm loss": 4.87361193, "grad_norm": 0.31149393, "learning_rate": 1.296e-05, "elapsed_time_per_iteration": 6.57269597, "memory(GiB)": 21.51, "elapsed_time": "9h 19m 28s", "remaining_time": "2h 17m 29s", "loss_scale": 1.0, "consumed_samples": 1307392, "global_step/max_steps": "5107/6362"} +{"lm loss": 4.85962391, "grad_norm": 0.31234965, "learning_rate": 1.294e-05, "elapsed_time_per_iteration": 6.47388649, "memory(GiB)": 21.51, "elapsed_time": "9h 19m 34s", "remaining_time": "2h 17m 22s", "loss_scale": 1.0, "consumed_samples": 1307648, "global_step/max_steps": "5108/6362"} +{"lm loss": 4.90243006, "grad_norm": 0.32035649, "learning_rate": 1.293e-05, "elapsed_time_per_iteration": 6.45982814, "memory(GiB)": 21.51, "elapsed_time": "9h 19m 41s", "remaining_time": "2h 17m 15s", "loss_scale": 1.0, "consumed_samples": 1307904, "global_step/max_steps": "5109/6362"} +{"lm loss": 4.85934925, "grad_norm": 0.29893765, "learning_rate": 1.291e-05, "elapsed_time_per_iteration": 6.52513361, "memory(GiB)": 21.51, "elapsed_time": "9h 19m 47s", "remaining_time": "2h 17m 9s", "loss_scale": 1.0, "consumed_samples": 1308160, "global_step/max_steps": "5110/6362"} +{"lm loss": 4.89168739, "grad_norm": 0.33661777, "learning_rate": 1.29e-05, "elapsed_time_per_iteration": 6.5734005, "memory(GiB)": 21.51, "elapsed_time": "9h 19m 54s", "remaining_time": "2h 17m 2s", "loss_scale": 1.0, "consumed_samples": 1308416, "global_step/max_steps": "5111/6362"} +{"lm loss": 4.8844986, "grad_norm": 0.31700537, "learning_rate": 1.288e-05, "elapsed_time_per_iteration": 6.59675574, "memory(GiB)": 21.51, "elapsed_time": "9h 20m 0s", "remaining_time": "2h 16m 56s", "loss_scale": 1.0, "consumed_samples": 1308672, "global_step/max_steps": "5112/6362"} +{"lm loss": 4.87885141, "grad_norm": 0.32442629, "learning_rate": 1.287e-05, "elapsed_time_per_iteration": 6.59541798, "memory(GiB)": 21.51, "elapsed_time": "9h 20m 7s", "remaining_time": "2h 16m 49s", "loss_scale": 1.0, "consumed_samples": 1308928, "global_step/max_steps": "5113/6362"} +{"lm loss": 4.8531251, "grad_norm": 0.31596658, "learning_rate": 1.285e-05, "elapsed_time_per_iteration": 6.64487147, "memory(GiB)": 21.51, "elapsed_time": "9h 20m 14s", "remaining_time": "2h 16m 43s", "loss_scale": 1.0, "consumed_samples": 1309184, "global_step/max_steps": "5114/6362"} +{"lm loss": 4.88482761, "grad_norm": 0.3200382, "learning_rate": 1.284e-05, "elapsed_time_per_iteration": 6.4559052, "memory(GiB)": 21.51, "elapsed_time": "9h 20m 20s", "remaining_time": "2h 16m 36s", "loss_scale": 1.0, "consumed_samples": 1309440, "global_step/max_steps": "5115/6362"} +{"lm loss": 4.87255144, "grad_norm": 0.31424138, "learning_rate": 1.282e-05, "elapsed_time_per_iteration": 6.5237639, "memory(GiB)": 21.51, "elapsed_time": "9h 20m 27s", "remaining_time": "2h 16m 29s", "loss_scale": 1.0, "consumed_samples": 1309696, "global_step/max_steps": "5116/6362"} +{"lm loss": 4.88231468, "grad_norm": 0.33245522, "learning_rate": 1.281e-05, "elapsed_time_per_iteration": 6.45943666, "memory(GiB)": 21.51, "elapsed_time": "9h 20m 33s", "remaining_time": "2h 16m 23s", "loss_scale": 1.0, "consumed_samples": 1309952, "global_step/max_steps": "5117/6362"} +{"lm loss": 4.88063669, "grad_norm": 0.32415813, "learning_rate": 1.279e-05, "elapsed_time_per_iteration": 6.35782051, "memory(GiB)": 21.51, "elapsed_time": "9h 20m 39s", "remaining_time": "2h 16m 16s", "loss_scale": 1.0, "consumed_samples": 1310208, "global_step/max_steps": "5118/6362"} +{"lm loss": 4.8742485, "grad_norm": 0.32656765, "learning_rate": 1.278e-05, "elapsed_time_per_iteration": 6.3372972, "memory(GiB)": 21.51, "elapsed_time": "9h 20m 46s", "remaining_time": "2h 16m 10s", "loss_scale": 1.0, "consumed_samples": 1310464, "global_step/max_steps": "5119/6362"} +{"lm loss": 4.88881922, "grad_norm": 0.30898568, "learning_rate": 1.276e-05, "elapsed_time_per_iteration": 6.63365698, "memory(GiB)": 21.51, "elapsed_time": "9h 20m 52s", "remaining_time": "2h 16m 3s", "loss_scale": 1.0, "consumed_samples": 1310720, "global_step/max_steps": "5120/6362"} +{"lm loss": 4.87719822, "grad_norm": 0.3095361, "learning_rate": 1.275e-05, "elapsed_time_per_iteration": 6.32923889, "memory(GiB)": 21.51, "elapsed_time": "9h 20m 59s", "remaining_time": "2h 15m 56s", "loss_scale": 1.0, "consumed_samples": 1310976, "global_step/max_steps": "5121/6362"} +{"lm loss": 4.8712697, "grad_norm": 0.31358537, "learning_rate": 1.273e-05, "elapsed_time_per_iteration": 6.52456307, "memory(GiB)": 21.51, "elapsed_time": "9h 21m 5s", "remaining_time": "2h 15m 50s", "loss_scale": 1.0, "consumed_samples": 1311232, "global_step/max_steps": "5122/6362"} +{"lm loss": 4.85334158, "grad_norm": 0.31814975, "learning_rate": 1.272e-05, "elapsed_time_per_iteration": 6.72068167, "memory(GiB)": 21.51, "elapsed_time": "9h 21m 12s", "remaining_time": "2h 15m 43s", "loss_scale": 1.0, "consumed_samples": 1311488, "global_step/max_steps": "5123/6362"} +{"lm loss": 4.8470459, "grad_norm": 0.31894115, "learning_rate": 1.27e-05, "elapsed_time_per_iteration": 6.57082105, "memory(GiB)": 21.51, "elapsed_time": "9h 21m 19s", "remaining_time": "2h 15m 37s", "loss_scale": 1.0, "consumed_samples": 1311744, "global_step/max_steps": "5124/6362"} +{"lm loss": 4.88646269, "grad_norm": 0.33650729, "learning_rate": 1.269e-05, "elapsed_time_per_iteration": 6.5951829, "memory(GiB)": 21.51, "elapsed_time": "9h 21m 25s", "remaining_time": "2h 15m 30s", "loss_scale": 1.0, "consumed_samples": 1312000, "global_step/max_steps": "5125/6362"} +{"lm loss": 4.88171482, "grad_norm": 0.30289975, "learning_rate": 1.267e-05, "elapsed_time_per_iteration": 6.35867906, "memory(GiB)": 21.51, "elapsed_time": "9h 21m 31s", "remaining_time": "2h 15m 23s", "loss_scale": 1.0, "consumed_samples": 1312256, "global_step/max_steps": "5126/6362"} +{"lm loss": 4.88365126, "grad_norm": 0.31243813, "learning_rate": 1.265e-05, "elapsed_time_per_iteration": 6.30597758, "memory(GiB)": 21.51, "elapsed_time": "9h 21m 38s", "remaining_time": "2h 15m 17s", "loss_scale": 1.0, "consumed_samples": 1312512, "global_step/max_steps": "5127/6362"} +{"lm loss": 4.88180733, "grad_norm": 0.34054556, "learning_rate": 1.264e-05, "elapsed_time_per_iteration": 6.35438681, "memory(GiB)": 21.51, "elapsed_time": "9h 21m 44s", "remaining_time": "2h 15m 10s", "loss_scale": 1.0, "consumed_samples": 1312768, "global_step/max_steps": "5128/6362"} +{"lm loss": 4.88093758, "grad_norm": 0.31091124, "learning_rate": 1.262e-05, "elapsed_time_per_iteration": 6.47139263, "memory(GiB)": 21.51, "elapsed_time": "9h 21m 51s", "remaining_time": "2h 15m 4s", "loss_scale": 1.0, "consumed_samples": 1313024, "global_step/max_steps": "5129/6362"} +{"lm loss": 4.865345, "grad_norm": 0.3008126, "learning_rate": 1.261e-05, "elapsed_time_per_iteration": 6.25478292, "memory(GiB)": 21.51, "elapsed_time": "9h 21m 57s", "remaining_time": "2h 14m 57s", "loss_scale": 1.0, "consumed_samples": 1313280, "global_step/max_steps": "5130/6362"} +{"lm loss": 4.87633657, "grad_norm": 0.32950279, "learning_rate": 1.259e-05, "elapsed_time_per_iteration": 6.52231908, "memory(GiB)": 21.51, "elapsed_time": "9h 22m 3s", "remaining_time": "2h 14m 50s", "loss_scale": 1.0, "consumed_samples": 1313536, "global_step/max_steps": "5131/6362"} +{"lm loss": 4.88476467, "grad_norm": 0.32474723, "learning_rate": 1.258e-05, "elapsed_time_per_iteration": 6.81638646, "memory(GiB)": 21.51, "elapsed_time": "9h 22m 10s", "remaining_time": "2h 14m 44s", "loss_scale": 1.0, "consumed_samples": 1313792, "global_step/max_steps": "5132/6362"} +{"lm loss": 4.87301874, "grad_norm": 0.31249955, "learning_rate": 1.256e-05, "elapsed_time_per_iteration": 6.48376298, "memory(GiB)": 21.51, "elapsed_time": "9h 22m 17s", "remaining_time": "2h 14m 37s", "loss_scale": 1.0, "consumed_samples": 1314048, "global_step/max_steps": "5133/6362"} +{"lm loss": 4.8904357, "grad_norm": 0.30881158, "learning_rate": 1.255e-05, "elapsed_time_per_iteration": 6.49749756, "memory(GiB)": 21.51, "elapsed_time": "9h 22m 23s", "remaining_time": "2h 14m 31s", "loss_scale": 1.0, "consumed_samples": 1314304, "global_step/max_steps": "5134/6362"} +{"lm loss": 4.87608194, "grad_norm": 0.32039437, "learning_rate": 1.253e-05, "elapsed_time_per_iteration": 6.52998614, "memory(GiB)": 21.51, "elapsed_time": "9h 22m 30s", "remaining_time": "2h 14m 24s", "loss_scale": 1.0, "consumed_samples": 1314560, "global_step/max_steps": "5135/6362"} +{"lm loss": 4.85787058, "grad_norm": 0.29695827, "learning_rate": 1.252e-05, "elapsed_time_per_iteration": 6.58440375, "memory(GiB)": 21.51, "elapsed_time": "9h 22m 36s", "remaining_time": "2h 14m 17s", "loss_scale": 1.0, "consumed_samples": 1314816, "global_step/max_steps": "5136/6362"} +{"lm loss": 4.86542749, "grad_norm": 0.31252408, "learning_rate": 1.25e-05, "elapsed_time_per_iteration": 6.6119349, "memory(GiB)": 21.51, "elapsed_time": "9h 22m 43s", "remaining_time": "2h 14m 11s", "loss_scale": 1.0, "consumed_samples": 1315072, "global_step/max_steps": "5137/6362"} +{"lm loss": 4.88103294, "grad_norm": 0.32924277, "learning_rate": 1.249e-05, "elapsed_time_per_iteration": 6.51891828, "memory(GiB)": 21.51, "elapsed_time": "9h 22m 49s", "remaining_time": "2h 14m 4s", "loss_scale": 1.0, "consumed_samples": 1315328, "global_step/max_steps": "5138/6362"} +{"lm loss": 4.87177658, "grad_norm": 0.31816927, "learning_rate": 1.247e-05, "elapsed_time_per_iteration": 6.8031373, "memory(GiB)": 21.51, "elapsed_time": "9h 22m 56s", "remaining_time": "2h 13m 58s", "loss_scale": 1.0, "consumed_samples": 1315584, "global_step/max_steps": "5139/6362"} +{"lm loss": 4.88459492, "grad_norm": 0.31597471, "learning_rate": 1.246e-05, "elapsed_time_per_iteration": 6.7265408, "memory(GiB)": 21.51, "elapsed_time": "9h 23m 3s", "remaining_time": "2h 13m 51s", "loss_scale": 1.0, "consumed_samples": 1315840, "global_step/max_steps": "5140/6362"} +{"lm loss": 4.90276337, "grad_norm": 0.33352166, "learning_rate": 1.244e-05, "elapsed_time_per_iteration": 6.52651691, "memory(GiB)": 21.51, "elapsed_time": "9h 23m 9s", "remaining_time": "2h 13m 45s", "loss_scale": 1.0, "consumed_samples": 1316096, "global_step/max_steps": "5141/6362"} +{"lm loss": 4.88169193, "grad_norm": 0.3391099, "learning_rate": 1.243e-05, "elapsed_time_per_iteration": 6.79530358, "memory(GiB)": 21.51, "elapsed_time": "9h 23m 16s", "remaining_time": "2h 13m 38s", "loss_scale": 1.0, "consumed_samples": 1316352, "global_step/max_steps": "5142/6362"} +{"lm loss": 4.90802956, "grad_norm": 0.35216931, "learning_rate": 1.241e-05, "elapsed_time_per_iteration": 6.55911708, "memory(GiB)": 21.51, "elapsed_time": "9h 23m 23s", "remaining_time": "2h 13m 32s", "loss_scale": 1.0, "consumed_samples": 1316608, "global_step/max_steps": "5143/6362"} +{"lm loss": 4.87127638, "grad_norm": 0.32625848, "learning_rate": 1.24e-05, "elapsed_time_per_iteration": 6.52900624, "memory(GiB)": 21.51, "elapsed_time": "9h 23m 29s", "remaining_time": "2h 13m 25s", "loss_scale": 1.0, "consumed_samples": 1316864, "global_step/max_steps": "5144/6362"} +{"lm loss": 4.87041235, "grad_norm": 0.32552734, "learning_rate": 1.238e-05, "elapsed_time_per_iteration": 6.56585979, "memory(GiB)": 21.51, "elapsed_time": "9h 23m 36s", "remaining_time": "2h 13m 18s", "loss_scale": 1.0, "consumed_samples": 1317120, "global_step/max_steps": "5145/6362"} +{"lm loss": 4.86087132, "grad_norm": 0.31168127, "learning_rate": 1.237e-05, "elapsed_time_per_iteration": 6.66329026, "memory(GiB)": 21.51, "elapsed_time": "9h 23m 43s", "remaining_time": "2h 13m 12s", "loss_scale": 1.0, "consumed_samples": 1317376, "global_step/max_steps": "5146/6362"} +{"lm loss": 4.8978796, "grad_norm": 0.32319146, "learning_rate": 1.236e-05, "elapsed_time_per_iteration": 6.60596108, "memory(GiB)": 21.51, "elapsed_time": "9h 23m 49s", "remaining_time": "2h 13m 5s", "loss_scale": 1.0, "consumed_samples": 1317632, "global_step/max_steps": "5147/6362"} +{"lm loss": 4.88038254, "grad_norm": 0.34321356, "learning_rate": 1.234e-05, "elapsed_time_per_iteration": 6.41303945, "memory(GiB)": 21.51, "elapsed_time": "9h 23m 56s", "remaining_time": "2h 12m 59s", "loss_scale": 1.0, "consumed_samples": 1317888, "global_step/max_steps": "5148/6362"} +{"lm loss": 4.87139654, "grad_norm": 0.3122206, "learning_rate": 1.233e-05, "elapsed_time_per_iteration": 6.76907539, "memory(GiB)": 21.51, "elapsed_time": "9h 24m 2s", "remaining_time": "2h 12m 52s", "loss_scale": 1.0, "consumed_samples": 1318144, "global_step/max_steps": "5149/6362"} +{"lm loss": 4.8676362, "grad_norm": 0.32039869, "learning_rate": 1.231e-05, "elapsed_time_per_iteration": 6.46931291, "memory(GiB)": 21.51, "elapsed_time": "9h 24m 9s", "remaining_time": "2h 12m 46s", "loss_scale": 1.0, "consumed_samples": 1318400, "global_step/max_steps": "5150/6362"} +{"lm loss": 4.87113714, "grad_norm": 0.31891361, "learning_rate": 1.23e-05, "elapsed_time_per_iteration": 6.59504604, "memory(GiB)": 21.51, "elapsed_time": "9h 24m 15s", "remaining_time": "2h 12m 39s", "loss_scale": 1.0, "consumed_samples": 1318656, "global_step/max_steps": "5151/6362"} +{"lm loss": 4.89408779, "grad_norm": 0.33292618, "learning_rate": 1.228e-05, "elapsed_time_per_iteration": 6.48515081, "memory(GiB)": 21.51, "elapsed_time": "9h 24m 22s", "remaining_time": "2h 12m 32s", "loss_scale": 1.0, "consumed_samples": 1318912, "global_step/max_steps": "5152/6362"} +{"lm loss": 4.87763023, "grad_norm": 0.30148387, "learning_rate": 1.227e-05, "elapsed_time_per_iteration": 6.57589269, "memory(GiB)": 21.51, "elapsed_time": "9h 24m 29s", "remaining_time": "2h 12m 26s", "loss_scale": 1.0, "consumed_samples": 1319168, "global_step/max_steps": "5153/6362"} +{"lm loss": 4.88420677, "grad_norm": 0.33134204, "learning_rate": 1.225e-05, "elapsed_time_per_iteration": 6.52159524, "memory(GiB)": 21.51, "elapsed_time": "9h 24m 35s", "remaining_time": "2h 12m 19s", "loss_scale": 1.0, "consumed_samples": 1319424, "global_step/max_steps": "5154/6362"} +{"lm loss": 4.87753153, "grad_norm": 0.30825755, "learning_rate": 1.224e-05, "elapsed_time_per_iteration": 6.53834009, "memory(GiB)": 21.51, "elapsed_time": "9h 24m 42s", "remaining_time": "2h 12m 13s", "loss_scale": 1.0, "consumed_samples": 1319680, "global_step/max_steps": "5155/6362"} +{"lm loss": 4.88572216, "grad_norm": 0.32055753, "learning_rate": 1.222e-05, "elapsed_time_per_iteration": 6.63072586, "memory(GiB)": 21.51, "elapsed_time": "9h 24m 48s", "remaining_time": "2h 12m 6s", "loss_scale": 1.0, "consumed_samples": 1319936, "global_step/max_steps": "5156/6362"} +{"lm loss": 4.83689213, "grad_norm": 0.32273474, "learning_rate": 1.221e-05, "elapsed_time_per_iteration": 6.71383166, "memory(GiB)": 21.51, "elapsed_time": "9h 24m 55s", "remaining_time": "2h 12m 0s", "loss_scale": 1.0, "consumed_samples": 1320192, "global_step/max_steps": "5157/6362"} +{"lm loss": 4.87974548, "grad_norm": 0.34467193, "learning_rate": 1.219e-05, "elapsed_time_per_iteration": 6.68344212, "memory(GiB)": 21.51, "elapsed_time": "9h 25m 2s", "remaining_time": "2h 11m 53s", "loss_scale": 1.0, "consumed_samples": 1320448, "global_step/max_steps": "5158/6362"} +{"lm loss": 4.85511446, "grad_norm": 0.3260754, "learning_rate": 1.218e-05, "elapsed_time_per_iteration": 6.45791841, "memory(GiB)": 21.51, "elapsed_time": "9h 25m 8s", "remaining_time": "2h 11m 46s", "loss_scale": 1.0, "consumed_samples": 1320704, "global_step/max_steps": "5159/6362"} +{"lm loss": 4.88482475, "grad_norm": 0.33467203, "learning_rate": 1.216e-05, "elapsed_time_per_iteration": 6.59743428, "memory(GiB)": 21.51, "elapsed_time": "9h 25m 15s", "remaining_time": "2h 11m 40s", "loss_scale": 1.0, "consumed_samples": 1320960, "global_step/max_steps": "5160/6362"} +{"lm loss": 4.87246847, "grad_norm": 0.32555607, "learning_rate": 1.215e-05, "elapsed_time_per_iteration": 6.80384398, "memory(GiB)": 21.51, "elapsed_time": "9h 25m 21s", "remaining_time": "2h 11m 33s", "loss_scale": 1.0, "consumed_samples": 1321216, "global_step/max_steps": "5161/6362"} +{"lm loss": 4.86462069, "grad_norm": 0.34258661, "learning_rate": 1.213e-05, "elapsed_time_per_iteration": 6.50282836, "memory(GiB)": 21.51, "elapsed_time": "9h 25m 28s", "remaining_time": "2h 11m 27s", "loss_scale": 1.0, "consumed_samples": 1321472, "global_step/max_steps": "5162/6362"} +{"lm loss": 4.88785315, "grad_norm": 0.31135002, "learning_rate": 1.212e-05, "elapsed_time_per_iteration": 6.4365828, "memory(GiB)": 21.51, "elapsed_time": "9h 25m 34s", "remaining_time": "2h 11m 20s", "loss_scale": 1.0, "consumed_samples": 1321728, "global_step/max_steps": "5163/6362"} +{"lm loss": 4.85486555, "grad_norm": 0.30694401, "learning_rate": 1.21e-05, "elapsed_time_per_iteration": 6.43710256, "memory(GiB)": 21.51, "elapsed_time": "9h 25m 41s", "remaining_time": "2h 11m 14s", "loss_scale": 1.0, "consumed_samples": 1321984, "global_step/max_steps": "5164/6362"} +{"lm loss": 4.88386202, "grad_norm": 0.33905417, "learning_rate": 1.209e-05, "elapsed_time_per_iteration": 6.62525868, "memory(GiB)": 21.51, "elapsed_time": "9h 25m 47s", "remaining_time": "2h 11m 7s", "loss_scale": 1.0, "consumed_samples": 1322240, "global_step/max_steps": "5165/6362"} +{"lm loss": 4.86504936, "grad_norm": 0.31093654, "learning_rate": 1.207e-05, "elapsed_time_per_iteration": 6.55790329, "memory(GiB)": 21.51, "elapsed_time": "9h 25m 54s", "remaining_time": "2h 11m 0s", "loss_scale": 1.0, "consumed_samples": 1322496, "global_step/max_steps": "5166/6362"} +{"lm loss": 4.89049721, "grad_norm": 0.33912671, "learning_rate": 1.206e-05, "elapsed_time_per_iteration": 6.68674469, "memory(GiB)": 21.51, "elapsed_time": "9h 26m 1s", "remaining_time": "2h 10m 54s", "loss_scale": 1.0, "consumed_samples": 1322752, "global_step/max_steps": "5167/6362"} +{"lm loss": 4.85557795, "grad_norm": 0.30443132, "learning_rate": 1.204e-05, "elapsed_time_per_iteration": 6.7483933, "memory(GiB)": 21.51, "elapsed_time": "9h 26m 7s", "remaining_time": "2h 10m 47s", "loss_scale": 1.0, "consumed_samples": 1323008, "global_step/max_steps": "5168/6362"} +{"lm loss": 4.88701534, "grad_norm": 0.3284182, "learning_rate": 1.203e-05, "elapsed_time_per_iteration": 6.60656786, "memory(GiB)": 21.51, "elapsed_time": "9h 26m 14s", "remaining_time": "2h 10m 41s", "loss_scale": 1.0, "consumed_samples": 1323264, "global_step/max_steps": "5169/6362"} +{"lm loss": 4.87208652, "grad_norm": 0.32728231, "learning_rate": 1.202e-05, "elapsed_time_per_iteration": 6.69330716, "memory(GiB)": 21.51, "elapsed_time": "9h 26m 21s", "remaining_time": "2h 10m 34s", "loss_scale": 1.0, "consumed_samples": 1323520, "global_step/max_steps": "5170/6362"} +{"lm loss": 4.85530329, "grad_norm": 0.32066235, "learning_rate": 1.2e-05, "elapsed_time_per_iteration": 6.49133277, "memory(GiB)": 21.51, "elapsed_time": "9h 26m 27s", "remaining_time": "2h 10m 28s", "loss_scale": 1.0, "consumed_samples": 1323776, "global_step/max_steps": "5171/6362"} +{"lm loss": 4.88267517, "grad_norm": 0.30456692, "learning_rate": 1.199e-05, "elapsed_time_per_iteration": 6.72335982, "memory(GiB)": 21.51, "elapsed_time": "9h 26m 34s", "remaining_time": "2h 10m 21s", "loss_scale": 1.0, "consumed_samples": 1324032, "global_step/max_steps": "5172/6362"} +{"lm loss": 4.88280916, "grad_norm": 0.35256854, "learning_rate": 1.197e-05, "elapsed_time_per_iteration": 6.67200923, "memory(GiB)": 21.51, "elapsed_time": "9h 26m 41s", "remaining_time": "2h 10m 15s", "loss_scale": 1.0, "consumed_samples": 1324288, "global_step/max_steps": "5173/6362"} +{"lm loss": 4.85920811, "grad_norm": 0.31058744, "learning_rate": 1.196e-05, "elapsed_time_per_iteration": 6.75846267, "memory(GiB)": 21.51, "elapsed_time": "9h 26m 47s", "remaining_time": "2h 10m 8s", "loss_scale": 1.0, "consumed_samples": 1324544, "global_step/max_steps": "5174/6362"} +{"lm loss": 4.86041355, "grad_norm": 0.3747811, "learning_rate": 1.194e-05, "elapsed_time_per_iteration": 6.5098958, "memory(GiB)": 21.51, "elapsed_time": "9h 26m 54s", "remaining_time": "2h 10m 1s", "loss_scale": 1.0, "consumed_samples": 1324800, "global_step/max_steps": "5175/6362"} +{"lm loss": 4.8657732, "grad_norm": 0.3430725, "learning_rate": 1.193e-05, "elapsed_time_per_iteration": 6.41904187, "memory(GiB)": 21.51, "elapsed_time": "9h 27m 0s", "remaining_time": "2h 9m 55s", "loss_scale": 1.0, "consumed_samples": 1325056, "global_step/max_steps": "5176/6362"} +{"lm loss": 4.88552761, "grad_norm": 0.3007161, "learning_rate": 1.191e-05, "elapsed_time_per_iteration": 6.53716207, "memory(GiB)": 21.51, "elapsed_time": "9h 27m 7s", "remaining_time": "2h 9m 48s", "loss_scale": 1.0, "consumed_samples": 1325312, "global_step/max_steps": "5177/6362"} +{"lm loss": 4.89805603, "grad_norm": 0.31853569, "learning_rate": 1.19e-05, "elapsed_time_per_iteration": 6.58593631, "memory(GiB)": 21.51, "elapsed_time": "9h 27m 13s", "remaining_time": "2h 9m 42s", "loss_scale": 1.0, "consumed_samples": 1325568, "global_step/max_steps": "5178/6362"} +{"lm loss": 4.86241722, "grad_norm": 0.30502415, "learning_rate": 1.188e-05, "elapsed_time_per_iteration": 6.64237261, "memory(GiB)": 21.51, "elapsed_time": "9h 27m 20s", "remaining_time": "2h 9m 35s", "loss_scale": 1.0, "consumed_samples": 1325824, "global_step/max_steps": "5179/6362"} +{"lm loss": 4.89711761, "grad_norm": 0.31959286, "learning_rate": 1.187e-05, "elapsed_time_per_iteration": 6.61033988, "memory(GiB)": 21.51, "elapsed_time": "9h 27m 27s", "remaining_time": "2h 9m 29s", "loss_scale": 1.0, "consumed_samples": 1326080, "global_step/max_steps": "5180/6362"} +{"lm loss": 4.87343597, "grad_norm": 0.3129752, "learning_rate": 1.186e-05, "elapsed_time_per_iteration": 6.75467777, "memory(GiB)": 21.51, "elapsed_time": "9h 27m 33s", "remaining_time": "2h 9m 22s", "loss_scale": 1.0, "consumed_samples": 1326336, "global_step/max_steps": "5181/6362"} +{"lm loss": 4.87535954, "grad_norm": 0.31396189, "learning_rate": 1.184e-05, "elapsed_time_per_iteration": 6.5354054, "memory(GiB)": 21.51, "elapsed_time": "9h 27m 40s", "remaining_time": "2h 9m 15s", "loss_scale": 1.0, "consumed_samples": 1326592, "global_step/max_steps": "5182/6362"} +{"lm loss": 4.87936735, "grad_norm": 0.32929355, "learning_rate": 1.183e-05, "elapsed_time_per_iteration": 6.6351099, "memory(GiB)": 21.51, "elapsed_time": "9h 27m 47s", "remaining_time": "2h 9m 9s", "loss_scale": 1.0, "consumed_samples": 1326848, "global_step/max_steps": "5183/6362"} +{"lm loss": 4.85421991, "grad_norm": 0.31700733, "learning_rate": 1.181e-05, "elapsed_time_per_iteration": 6.53898478, "memory(GiB)": 21.51, "elapsed_time": "9h 27m 53s", "remaining_time": "2h 9m 2s", "loss_scale": 1.0, "consumed_samples": 1327104, "global_step/max_steps": "5184/6362"} +{"lm loss": 4.88502026, "grad_norm": 0.28900516, "learning_rate": 1.18e-05, "elapsed_time_per_iteration": 6.52300739, "memory(GiB)": 21.51, "elapsed_time": "9h 28m 0s", "remaining_time": "2h 8m 56s", "loss_scale": 1.0, "consumed_samples": 1327360, "global_step/max_steps": "5185/6362"} +{"lm loss": 4.88753748, "grad_norm": 0.31299943, "learning_rate": 1.178e-05, "elapsed_time_per_iteration": 6.58614802, "memory(GiB)": 21.51, "elapsed_time": "9h 28m 6s", "remaining_time": "2h 8m 49s", "loss_scale": 1.0, "consumed_samples": 1327616, "global_step/max_steps": "5186/6362"} +{"lm loss": 4.8477807, "grad_norm": 0.32772437, "learning_rate": 1.177e-05, "elapsed_time_per_iteration": 6.69587946, "memory(GiB)": 21.51, "elapsed_time": "9h 28m 13s", "remaining_time": "2h 8m 43s", "loss_scale": 1.0, "consumed_samples": 1327872, "global_step/max_steps": "5187/6362"} +{"lm loss": 4.88186884, "grad_norm": 0.3052986, "learning_rate": 1.175e-05, "elapsed_time_per_iteration": 6.74155164, "memory(GiB)": 21.51, "elapsed_time": "9h 28m 20s", "remaining_time": "2h 8m 36s", "loss_scale": 1.0, "consumed_samples": 1328128, "global_step/max_steps": "5188/6362"} +{"lm loss": 4.89497232, "grad_norm": 0.31831515, "learning_rate": 1.174e-05, "elapsed_time_per_iteration": 6.69443727, "memory(GiB)": 21.51, "elapsed_time": "9h 28m 26s", "remaining_time": "2h 8m 30s", "loss_scale": 1.0, "consumed_samples": 1328384, "global_step/max_steps": "5189/6362"} +{"lm loss": 4.88386154, "grad_norm": 0.31357273, "learning_rate": 1.172e-05, "elapsed_time_per_iteration": 6.58437586, "memory(GiB)": 21.51, "elapsed_time": "9h 28m 33s", "remaining_time": "2h 8m 23s", "loss_scale": 1.0, "consumed_samples": 1328640, "global_step/max_steps": "5190/6362"} +{"lm loss": 4.88397694, "grad_norm": 0.30813131, "learning_rate": 1.171e-05, "elapsed_time_per_iteration": 6.44799232, "memory(GiB)": 21.51, "elapsed_time": "9h 28m 39s", "remaining_time": "2h 8m 16s", "loss_scale": 1.0, "consumed_samples": 1328896, "global_step/max_steps": "5191/6362"} +{"lm loss": 4.85466003, "grad_norm": 0.29087335, "learning_rate": 1.17e-05, "elapsed_time_per_iteration": 6.38278818, "memory(GiB)": 21.51, "elapsed_time": "9h 28m 46s", "remaining_time": "2h 8m 10s", "loss_scale": 1.0, "consumed_samples": 1329152, "global_step/max_steps": "5192/6362"} +{"lm loss": 4.87400818, "grad_norm": 0.30098423, "learning_rate": 1.168e-05, "elapsed_time_per_iteration": 6.67329812, "memory(GiB)": 21.51, "elapsed_time": "9h 28m 53s", "remaining_time": "2h 8m 3s", "loss_scale": 1.0, "consumed_samples": 1329408, "global_step/max_steps": "5193/6362"} +{"lm loss": 4.86988068, "grad_norm": 0.31504372, "learning_rate": 1.167e-05, "elapsed_time_per_iteration": 6.81170964, "memory(GiB)": 21.51, "elapsed_time": "9h 28m 59s", "remaining_time": "2h 7m 57s", "loss_scale": 1.0, "consumed_samples": 1329664, "global_step/max_steps": "5194/6362"} +{"lm loss": 4.87970066, "grad_norm": 0.28424966, "learning_rate": 1.165e-05, "elapsed_time_per_iteration": 6.51618671, "memory(GiB)": 21.51, "elapsed_time": "9h 29m 6s", "remaining_time": "2h 7m 50s", "loss_scale": 1.0, "consumed_samples": 1329920, "global_step/max_steps": "5195/6362"} +{"lm loss": 4.89138222, "grad_norm": 0.29033852, "learning_rate": 1.164e-05, "elapsed_time_per_iteration": 6.49157572, "memory(GiB)": 21.51, "elapsed_time": "9h 29m 12s", "remaining_time": "2h 7m 44s", "loss_scale": 1.0, "consumed_samples": 1330176, "global_step/max_steps": "5196/6362"} +{"lm loss": 4.85606909, "grad_norm": 0.31216487, "learning_rate": 1.162e-05, "elapsed_time_per_iteration": 6.39291549, "memory(GiB)": 21.51, "elapsed_time": "9h 29m 19s", "remaining_time": "2h 7m 37s", "loss_scale": 1.0, "consumed_samples": 1330432, "global_step/max_steps": "5197/6362"} +{"lm loss": 4.89698219, "grad_norm": 0.31133854, "learning_rate": 1.161e-05, "elapsed_time_per_iteration": 6.51255751, "memory(GiB)": 21.51, "elapsed_time": "9h 29m 25s", "remaining_time": "2h 7m 30s", "loss_scale": 1.0, "consumed_samples": 1330688, "global_step/max_steps": "5198/6362"} +{"lm loss": 4.86971903, "grad_norm": 0.32814708, "learning_rate": 1.16e-05, "elapsed_time_per_iteration": 6.46252489, "memory(GiB)": 21.51, "elapsed_time": "9h 29m 32s", "remaining_time": "2h 7m 24s", "loss_scale": 1.0, "consumed_samples": 1330944, "global_step/max_steps": "5199/6362"} +{"lm loss": 4.86132574, "grad_norm": 0.31756386, "learning_rate": 1.158e-05, "elapsed_time_per_iteration": 6.40736628, "memory(GiB)": 21.51, "elapsed_time": "9h 29m 38s", "remaining_time": "2h 7m 17s", "loss_scale": 1.0, "consumed_samples": 1331200, "global_step/max_steps": "5200/6362"} +{"lm loss": 4.88635302, "grad_norm": 0.31573573, "learning_rate": 1.157e-05, "elapsed_time_per_iteration": 6.45898581, "memory(GiB)": 21.51, "elapsed_time": "9h 29m 45s", "remaining_time": "2h 7m 11s", "loss_scale": 1.0, "consumed_samples": 1331456, "global_step/max_steps": "5201/6362"} +{"lm loss": 4.89685249, "grad_norm": 0.29605919, "learning_rate": 1.155e-05, "elapsed_time_per_iteration": 6.50970054, "memory(GiB)": 21.51, "elapsed_time": "9h 29m 51s", "remaining_time": "2h 7m 4s", "loss_scale": 1.0, "consumed_samples": 1331712, "global_step/max_steps": "5202/6362"} +{"lm loss": 4.8967638, "grad_norm": 0.32634562, "learning_rate": 1.154e-05, "elapsed_time_per_iteration": 6.75113249, "memory(GiB)": 21.51, "elapsed_time": "9h 29m 58s", "remaining_time": "2h 6m 57s", "loss_scale": 1.0, "consumed_samples": 1331968, "global_step/max_steps": "5203/6362"} +{"lm loss": 4.8659091, "grad_norm": 0.31571639, "learning_rate": 1.152e-05, "elapsed_time_per_iteration": 6.55843711, "memory(GiB)": 21.51, "elapsed_time": "9h 30m 4s", "remaining_time": "2h 6m 51s", "loss_scale": 1.0, "consumed_samples": 1332224, "global_step/max_steps": "5204/6362"} +{"lm loss": 4.86851835, "grad_norm": 0.32393241, "learning_rate": 1.151e-05, "elapsed_time_per_iteration": 6.65188241, "memory(GiB)": 21.51, "elapsed_time": "9h 30m 11s", "remaining_time": "2h 6m 44s", "loss_scale": 1.0, "consumed_samples": 1332480, "global_step/max_steps": "5205/6362"} +{"lm loss": 4.86723423, "grad_norm": 0.28009158, "learning_rate": 1.15e-05, "elapsed_time_per_iteration": 6.49104023, "memory(GiB)": 21.51, "elapsed_time": "9h 30m 18s", "remaining_time": "2h 6m 38s", "loss_scale": 1.0, "consumed_samples": 1332736, "global_step/max_steps": "5206/6362"} +{"lm loss": 4.90138149, "grad_norm": 0.30048314, "learning_rate": 1.148e-05, "elapsed_time_per_iteration": 6.65376163, "memory(GiB)": 21.51, "elapsed_time": "9h 30m 24s", "remaining_time": "2h 6m 31s", "loss_scale": 1.0, "consumed_samples": 1332992, "global_step/max_steps": "5207/6362"} +{"lm loss": 4.84528828, "grad_norm": 0.32010692, "learning_rate": 1.147e-05, "elapsed_time_per_iteration": 6.66490364, "memory(GiB)": 21.51, "elapsed_time": "9h 30m 31s", "remaining_time": "2h 6m 25s", "loss_scale": 1.0, "consumed_samples": 1333248, "global_step/max_steps": "5208/6362"} +{"lm loss": 4.86172342, "grad_norm": 0.30463624, "learning_rate": 1.145e-05, "elapsed_time_per_iteration": 6.51531792, "memory(GiB)": 21.51, "elapsed_time": "9h 30m 37s", "remaining_time": "2h 6m 18s", "loss_scale": 1.0, "consumed_samples": 1333504, "global_step/max_steps": "5209/6362"} +{"lm loss": 4.8642168, "grad_norm": 0.3363649, "learning_rate": 1.144e-05, "elapsed_time_per_iteration": 6.67357516, "memory(GiB)": 21.51, "elapsed_time": "9h 30m 44s", "remaining_time": "2h 6m 11s", "loss_scale": 1.0, "consumed_samples": 1333760, "global_step/max_steps": "5210/6362"} +{"lm loss": 4.85980797, "grad_norm": 0.31881753, "learning_rate": 1.142e-05, "elapsed_time_per_iteration": 6.41173935, "memory(GiB)": 21.51, "elapsed_time": "9h 30m 50s", "remaining_time": "2h 6m 5s", "loss_scale": 1.0, "consumed_samples": 1334016, "global_step/max_steps": "5211/6362"} +{"lm loss": 4.85660028, "grad_norm": 0.29373693, "learning_rate": 1.141e-05, "elapsed_time_per_iteration": 6.6009233, "memory(GiB)": 21.51, "elapsed_time": "9h 30m 57s", "remaining_time": "2h 5m 58s", "loss_scale": 1.0, "consumed_samples": 1334272, "global_step/max_steps": "5212/6362"} +{"lm loss": 4.86466217, "grad_norm": 0.33361655, "learning_rate": 1.14e-05, "elapsed_time_per_iteration": 6.39968801, "memory(GiB)": 21.51, "elapsed_time": "9h 31m 3s", "remaining_time": "2h 5m 52s", "loss_scale": 1.0, "consumed_samples": 1334528, "global_step/max_steps": "5213/6362"} +{"lm loss": 4.87757015, "grad_norm": 0.3114312, "learning_rate": 1.138e-05, "elapsed_time_per_iteration": 6.45645404, "memory(GiB)": 21.51, "elapsed_time": "9h 31m 10s", "remaining_time": "2h 5m 45s", "loss_scale": 1.0, "consumed_samples": 1334784, "global_step/max_steps": "5214/6362"} +{"lm loss": 4.86401653, "grad_norm": 0.31105602, "learning_rate": 1.137e-05, "elapsed_time_per_iteration": 6.50880504, "memory(GiB)": 21.51, "elapsed_time": "9h 31m 16s", "remaining_time": "2h 5m 38s", "loss_scale": 1.0, "consumed_samples": 1335040, "global_step/max_steps": "5215/6362"} +{"lm loss": 4.8859539, "grad_norm": 0.32277676, "learning_rate": 1.135e-05, "elapsed_time_per_iteration": 6.44363141, "memory(GiB)": 21.51, "elapsed_time": "9h 31m 23s", "remaining_time": "2h 5m 32s", "loss_scale": 1.0, "consumed_samples": 1335296, "global_step/max_steps": "5216/6362"} +{"lm loss": 4.87916231, "grad_norm": 0.32176685, "learning_rate": 1.134e-05, "elapsed_time_per_iteration": 6.62649107, "memory(GiB)": 21.51, "elapsed_time": "9h 31m 29s", "remaining_time": "2h 5m 25s", "loss_scale": 1.0, "consumed_samples": 1335552, "global_step/max_steps": "5217/6362"} +{"lm loss": 4.89610386, "grad_norm": 0.30915508, "learning_rate": 1.133e-05, "elapsed_time_per_iteration": 6.42188907, "memory(GiB)": 21.51, "elapsed_time": "9h 31m 36s", "remaining_time": "2h 5m 19s", "loss_scale": 1.0, "consumed_samples": 1335808, "global_step/max_steps": "5218/6362"} +{"lm loss": 4.87016535, "grad_norm": 0.32509866, "learning_rate": 1.131e-05, "elapsed_time_per_iteration": 6.61632061, "memory(GiB)": 21.51, "elapsed_time": "9h 31m 43s", "remaining_time": "2h 5m 12s", "loss_scale": 1.0, "consumed_samples": 1336064, "global_step/max_steps": "5219/6362"} +{"lm loss": 4.87653589, "grad_norm": 0.32098678, "learning_rate": 1.13e-05, "elapsed_time_per_iteration": 6.53199482, "memory(GiB)": 21.51, "elapsed_time": "9h 31m 49s", "remaining_time": "2h 5m 6s", "loss_scale": 1.0, "consumed_samples": 1336320, "global_step/max_steps": "5220/6362"} +{"lm loss": 4.89116716, "grad_norm": 0.3241165, "learning_rate": 1.128e-05, "elapsed_time_per_iteration": 6.49248481, "memory(GiB)": 21.51, "elapsed_time": "9h 31m 56s", "remaining_time": "2h 4m 59s", "loss_scale": 1.0, "consumed_samples": 1336576, "global_step/max_steps": "5221/6362"} +{"lm loss": 4.88739681, "grad_norm": 0.31789967, "learning_rate": 1.127e-05, "elapsed_time_per_iteration": 6.58503151, "memory(GiB)": 21.51, "elapsed_time": "9h 32m 2s", "remaining_time": "2h 4m 52s", "loss_scale": 1.0, "consumed_samples": 1336832, "global_step/max_steps": "5222/6362"} +{"lm loss": 4.88356829, "grad_norm": 0.30395776, "learning_rate": 1.125e-05, "elapsed_time_per_iteration": 6.59710026, "memory(GiB)": 21.51, "elapsed_time": "9h 32m 9s", "remaining_time": "2h 4m 46s", "loss_scale": 1.0, "consumed_samples": 1337088, "global_step/max_steps": "5223/6362"} +{"lm loss": 4.86987257, "grad_norm": 0.30375791, "learning_rate": 1.124e-05, "elapsed_time_per_iteration": 6.44552875, "memory(GiB)": 21.51, "elapsed_time": "9h 32m 15s", "remaining_time": "2h 4m 39s", "loss_scale": 1.0, "consumed_samples": 1337344, "global_step/max_steps": "5224/6362"} +{"lm loss": 4.8722415, "grad_norm": 0.31053782, "learning_rate": 1.123e-05, "elapsed_time_per_iteration": 6.45383382, "memory(GiB)": 21.51, "elapsed_time": "9h 32m 22s", "remaining_time": "2h 4m 33s", "loss_scale": 1.0, "consumed_samples": 1337600, "global_step/max_steps": "5225/6362"} +{"lm loss": 4.88067818, "grad_norm": 0.30579671, "learning_rate": 1.121e-05, "elapsed_time_per_iteration": 6.46109366, "memory(GiB)": 21.51, "elapsed_time": "9h 32m 28s", "remaining_time": "2h 4m 26s", "loss_scale": 1.0, "consumed_samples": 1337856, "global_step/max_steps": "5226/6362"} +{"lm loss": 4.86650467, "grad_norm": 0.2950891, "learning_rate": 1.12e-05, "elapsed_time_per_iteration": 6.7299459, "memory(GiB)": 21.51, "elapsed_time": "9h 32m 35s", "remaining_time": "2h 4m 19s", "loss_scale": 1.0, "consumed_samples": 1338112, "global_step/max_steps": "5227/6362"} +{"lm loss": 4.85581207, "grad_norm": 0.30793363, "learning_rate": 1.118e-05, "elapsed_time_per_iteration": 6.44833493, "memory(GiB)": 21.51, "elapsed_time": "9h 32m 41s", "remaining_time": "2h 4m 13s", "loss_scale": 1.0, "consumed_samples": 1338368, "global_step/max_steps": "5228/6362"} +{"lm loss": 4.86602545, "grad_norm": 0.29250297, "learning_rate": 1.117e-05, "elapsed_time_per_iteration": 6.62495136, "memory(GiB)": 21.51, "elapsed_time": "9h 32m 48s", "remaining_time": "2h 4m 6s", "loss_scale": 1.0, "consumed_samples": 1338624, "global_step/max_steps": "5229/6362"} +{"lm loss": 4.87326384, "grad_norm": 0.29853573, "learning_rate": 1.116e-05, "elapsed_time_per_iteration": 6.39153838, "memory(GiB)": 21.51, "elapsed_time": "9h 32m 54s", "remaining_time": "2h 4m 0s", "loss_scale": 1.0, "consumed_samples": 1338880, "global_step/max_steps": "5230/6362"} +{"lm loss": 4.87496281, "grad_norm": 0.32630199, "learning_rate": 1.114e-05, "elapsed_time_per_iteration": 6.79108524, "memory(GiB)": 21.51, "elapsed_time": "9h 33m 1s", "remaining_time": "2h 3m 53s", "loss_scale": 1.0, "consumed_samples": 1339136, "global_step/max_steps": "5231/6362"} +{"lm loss": 4.8621769, "grad_norm": 0.29668504, "learning_rate": 1.113e-05, "elapsed_time_per_iteration": 6.58746052, "memory(GiB)": 21.51, "elapsed_time": "9h 33m 8s", "remaining_time": "2h 3m 47s", "loss_scale": 1.0, "consumed_samples": 1339392, "global_step/max_steps": "5232/6362"} +{"lm loss": 4.88770771, "grad_norm": 0.32083592, "learning_rate": 1.111e-05, "elapsed_time_per_iteration": 6.57735562, "memory(GiB)": 21.51, "elapsed_time": "9h 33m 14s", "remaining_time": "2h 3m 40s", "loss_scale": 1.0, "consumed_samples": 1339648, "global_step/max_steps": "5233/6362"} +{"lm loss": 4.84692287, "grad_norm": 0.32264933, "learning_rate": 1.11e-05, "elapsed_time_per_iteration": 6.56659031, "memory(GiB)": 21.51, "elapsed_time": "9h 33m 21s", "remaining_time": "2h 3m 33s", "loss_scale": 1.0, "consumed_samples": 1339904, "global_step/max_steps": "5234/6362"} +{"lm loss": 4.87693167, "grad_norm": 0.33627984, "learning_rate": 1.109e-05, "elapsed_time_per_iteration": 6.76201701, "memory(GiB)": 21.51, "elapsed_time": "9h 33m 28s", "remaining_time": "2h 3m 27s", "loss_scale": 1.0, "consumed_samples": 1340160, "global_step/max_steps": "5235/6362"} +{"lm loss": 4.87374163, "grad_norm": 0.30334994, "learning_rate": 1.107e-05, "elapsed_time_per_iteration": 6.78786922, "memory(GiB)": 21.51, "elapsed_time": "9h 33m 34s", "remaining_time": "2h 3m 20s", "loss_scale": 1.0, "consumed_samples": 1340416, "global_step/max_steps": "5236/6362"} +{"lm loss": 4.89874697, "grad_norm": 0.34787658, "learning_rate": 1.106e-05, "elapsed_time_per_iteration": 6.48632455, "memory(GiB)": 21.51, "elapsed_time": "9h 33m 41s", "remaining_time": "2h 3m 14s", "loss_scale": 1.0, "consumed_samples": 1340672, "global_step/max_steps": "5237/6362"} +{"lm loss": 4.86653566, "grad_norm": 0.30899134, "learning_rate": 1.104e-05, "elapsed_time_per_iteration": 6.39108276, "memory(GiB)": 21.51, "elapsed_time": "9h 33m 47s", "remaining_time": "2h 3m 7s", "loss_scale": 1.0, "consumed_samples": 1340928, "global_step/max_steps": "5238/6362"} +{"lm loss": 4.88145685, "grad_norm": 0.34994355, "learning_rate": 1.103e-05, "elapsed_time_per_iteration": 6.75515366, "memory(GiB)": 21.51, "elapsed_time": "9h 33m 54s", "remaining_time": "2h 3m 1s", "loss_scale": 1.0, "consumed_samples": 1341184, "global_step/max_steps": "5239/6362"} +{"lm loss": 4.85459423, "grad_norm": 0.31096947, "learning_rate": 1.102e-05, "elapsed_time_per_iteration": 6.67270494, "memory(GiB)": 21.51, "elapsed_time": "9h 34m 1s", "remaining_time": "2h 2m 54s", "loss_scale": 1.0, "consumed_samples": 1341440, "global_step/max_steps": "5240/6362"} +{"lm loss": 4.86324549, "grad_norm": 0.31972575, "learning_rate": 1.1e-05, "elapsed_time_per_iteration": 6.54301214, "memory(GiB)": 21.51, "elapsed_time": "9h 34m 7s", "remaining_time": "2h 2m 48s", "loss_scale": 1.0, "consumed_samples": 1341696, "global_step/max_steps": "5241/6362"} +{"lm loss": 4.86827993, "grad_norm": 0.34154803, "learning_rate": 1.099e-05, "elapsed_time_per_iteration": 6.91800642, "memory(GiB)": 21.51, "elapsed_time": "9h 34m 14s", "remaining_time": "2h 2m 41s", "loss_scale": 1.0, "consumed_samples": 1341952, "global_step/max_steps": "5242/6362"} +{"lm loss": 4.88618612, "grad_norm": 0.32814538, "learning_rate": 1.098e-05, "elapsed_time_per_iteration": 6.67203927, "memory(GiB)": 21.51, "elapsed_time": "9h 34m 21s", "remaining_time": "2h 2m 34s", "loss_scale": 1.0, "consumed_samples": 1342208, "global_step/max_steps": "5243/6362"} +{"lm loss": 4.86682987, "grad_norm": 0.29083398, "learning_rate": 1.096e-05, "elapsed_time_per_iteration": 6.54272842, "memory(GiB)": 21.51, "elapsed_time": "9h 34m 27s", "remaining_time": "2h 2m 28s", "loss_scale": 1.0, "consumed_samples": 1342464, "global_step/max_steps": "5244/6362"} +{"lm loss": 4.89402246, "grad_norm": 0.32361561, "learning_rate": 1.095e-05, "elapsed_time_per_iteration": 6.49221373, "memory(GiB)": 21.51, "elapsed_time": "9h 34m 34s", "remaining_time": "2h 2m 21s", "loss_scale": 1.0, "consumed_samples": 1342720, "global_step/max_steps": "5245/6362"} +{"lm loss": 4.8924489, "grad_norm": 0.31681284, "learning_rate": 1.093e-05, "elapsed_time_per_iteration": 6.44392776, "memory(GiB)": 21.51, "elapsed_time": "9h 34m 40s", "remaining_time": "2h 2m 15s", "loss_scale": 1.0, "consumed_samples": 1342976, "global_step/max_steps": "5246/6362"} +{"lm loss": 4.88146257, "grad_norm": 0.33049721, "learning_rate": 1.092e-05, "elapsed_time_per_iteration": 6.75039768, "memory(GiB)": 21.51, "elapsed_time": "9h 34m 47s", "remaining_time": "2h 2m 8s", "loss_scale": 1.0, "consumed_samples": 1343232, "global_step/max_steps": "5247/6362"} +{"lm loss": 4.88503456, "grad_norm": 0.31976047, "learning_rate": 1.091e-05, "elapsed_time_per_iteration": 6.6888659, "memory(GiB)": 21.51, "elapsed_time": "9h 34m 54s", "remaining_time": "2h 2m 2s", "loss_scale": 1.0, "consumed_samples": 1343488, "global_step/max_steps": "5248/6362"} +{"lm loss": 4.865695, "grad_norm": 0.29814145, "learning_rate": 1.089e-05, "elapsed_time_per_iteration": 6.62249184, "memory(GiB)": 21.51, "elapsed_time": "9h 35m 0s", "remaining_time": "2h 1m 55s", "loss_scale": 1.0, "consumed_samples": 1343744, "global_step/max_steps": "5249/6362"} +{"lm loss": 4.86261845, "grad_norm": 0.31179374, "learning_rate": 1.088e-05, "elapsed_time_per_iteration": 6.54069233, "memory(GiB)": 21.51, "elapsed_time": "9h 35m 7s", "remaining_time": "2h 1m 48s", "loss_scale": 1.0, "consumed_samples": 1344000, "global_step/max_steps": "5250/6362"} +{"lm loss": 4.85127544, "grad_norm": 0.31096974, "learning_rate": 1.087e-05, "elapsed_time_per_iteration": 6.46429396, "memory(GiB)": 21.51, "elapsed_time": "9h 35m 13s", "remaining_time": "2h 1m 42s", "loss_scale": 1.0, "consumed_samples": 1344256, "global_step/max_steps": "5251/6362"} +{"lm loss": 4.86364269, "grad_norm": 0.32767341, "learning_rate": 1.085e-05, "elapsed_time_per_iteration": 6.58707976, "memory(GiB)": 21.51, "elapsed_time": "9h 35m 20s", "remaining_time": "2h 1m 35s", "loss_scale": 1.0, "consumed_samples": 1344512, "global_step/max_steps": "5252/6362"} +{"lm loss": 4.87166882, "grad_norm": 0.30636355, "learning_rate": 1.084e-05, "elapsed_time_per_iteration": 6.59422588, "memory(GiB)": 21.51, "elapsed_time": "9h 35m 27s", "remaining_time": "2h 1m 29s", "loss_scale": 1.0, "consumed_samples": 1344768, "global_step/max_steps": "5253/6362"} +{"lm loss": 4.88387871, "grad_norm": 0.32459828, "learning_rate": 1.082e-05, "elapsed_time_per_iteration": 6.9034605, "memory(GiB)": 21.51, "elapsed_time": "9h 35m 33s", "remaining_time": "2h 1m 22s", "loss_scale": 1.0, "consumed_samples": 1345024, "global_step/max_steps": "5254/6362"} +{"lm loss": 4.87874365, "grad_norm": 0.30821776, "learning_rate": 1.081e-05, "elapsed_time_per_iteration": 6.51969862, "memory(GiB)": 21.51, "elapsed_time": "9h 35m 40s", "remaining_time": "2h 1m 16s", "loss_scale": 1.0, "consumed_samples": 1345280, "global_step/max_steps": "5255/6362"} +{"lm loss": 4.89242172, "grad_norm": 0.3163465, "learning_rate": 1.08e-05, "elapsed_time_per_iteration": 6.51304841, "memory(GiB)": 21.51, "elapsed_time": "9h 35m 46s", "remaining_time": "2h 1m 9s", "loss_scale": 1.0, "consumed_samples": 1345536, "global_step/max_steps": "5256/6362"} +{"lm loss": 4.83780336, "grad_norm": 0.3159034, "learning_rate": 1.078e-05, "elapsed_time_per_iteration": 6.44747853, "memory(GiB)": 21.51, "elapsed_time": "9h 35m 53s", "remaining_time": "2h 1m 2s", "loss_scale": 1.0, "consumed_samples": 1345792, "global_step/max_steps": "5257/6362"} +{"lm loss": 4.90259218, "grad_norm": 0.29391479, "learning_rate": 1.077e-05, "elapsed_time_per_iteration": 6.47083545, "memory(GiB)": 21.51, "elapsed_time": "9h 35m 59s", "remaining_time": "2h 0m 56s", "loss_scale": 1.0, "consumed_samples": 1346048, "global_step/max_steps": "5258/6362"} +{"lm loss": 4.89131641, "grad_norm": 0.32688814, "learning_rate": 1.076e-05, "elapsed_time_per_iteration": 6.52346134, "memory(GiB)": 21.51, "elapsed_time": "9h 36m 6s", "remaining_time": "2h 0m 49s", "loss_scale": 1.0, "consumed_samples": 1346304, "global_step/max_steps": "5259/6362"} +{"lm loss": 4.87412548, "grad_norm": 0.28944629, "learning_rate": 1.074e-05, "elapsed_time_per_iteration": 6.3125267, "memory(GiB)": 21.51, "elapsed_time": "9h 36m 12s", "remaining_time": "2h 0m 43s", "loss_scale": 1.0, "consumed_samples": 1346560, "global_step/max_steps": "5260/6362"} +{"lm loss": 4.86711311, "grad_norm": 0.29230982, "learning_rate": 1.073e-05, "elapsed_time_per_iteration": 6.58822465, "memory(GiB)": 21.51, "elapsed_time": "9h 36m 19s", "remaining_time": "2h 0m 36s", "loss_scale": 1.0, "consumed_samples": 1346816, "global_step/max_steps": "5261/6362"} +{"lm loss": 4.86491156, "grad_norm": 0.29971069, "learning_rate": 1.071e-05, "elapsed_time_per_iteration": 6.35798264, "memory(GiB)": 21.51, "elapsed_time": "9h 36m 25s", "remaining_time": "2h 0m 29s", "loss_scale": 1.0, "consumed_samples": 1347072, "global_step/max_steps": "5262/6362"} +{"lm loss": 4.87321806, "grad_norm": 0.31096005, "learning_rate": 1.07e-05, "elapsed_time_per_iteration": 6.84398913, "memory(GiB)": 21.51, "elapsed_time": "9h 36m 32s", "remaining_time": "2h 0m 23s", "loss_scale": 1.0, "consumed_samples": 1347328, "global_step/max_steps": "5263/6362"} +{"lm loss": 4.86227369, "grad_norm": 0.32411388, "learning_rate": 1.069e-05, "elapsed_time_per_iteration": 6.60405374, "memory(GiB)": 21.51, "elapsed_time": "9h 36m 39s", "remaining_time": "2h 0m 16s", "loss_scale": 1.0, "consumed_samples": 1347584, "global_step/max_steps": "5264/6362"} +{"lm loss": 4.86961269, "grad_norm": 0.30026904, "learning_rate": 1.067e-05, "elapsed_time_per_iteration": 6.72713566, "memory(GiB)": 21.51, "elapsed_time": "9h 36m 45s", "remaining_time": "2h 0m 10s", "loss_scale": 1.0, "consumed_samples": 1347840, "global_step/max_steps": "5265/6362"} +{"lm loss": 4.86572838, "grad_norm": 0.32660219, "learning_rate": 1.066e-05, "elapsed_time_per_iteration": 6.49747086, "memory(GiB)": 21.51, "elapsed_time": "9h 36m 52s", "remaining_time": "2h 0m 3s", "loss_scale": 1.0, "consumed_samples": 1348096, "global_step/max_steps": "5266/6362"} +{"lm loss": 4.88106155, "grad_norm": 0.2979297, "learning_rate": 1.065e-05, "elapsed_time_per_iteration": 6.56816363, "memory(GiB)": 21.51, "elapsed_time": "9h 36m 58s", "remaining_time": "1h 59m 57s", "loss_scale": 1.0, "consumed_samples": 1348352, "global_step/max_steps": "5267/6362"} +{"lm loss": 4.87215853, "grad_norm": 0.31103694, "learning_rate": 1.063e-05, "elapsed_time_per_iteration": 6.25572443, "memory(GiB)": 21.51, "elapsed_time": "9h 37m 5s", "remaining_time": "1h 59m 50s", "loss_scale": 1.0, "consumed_samples": 1348608, "global_step/max_steps": "5268/6362"} +{"lm loss": 4.88415146, "grad_norm": 0.30339774, "learning_rate": 1.062e-05, "elapsed_time_per_iteration": 6.6857326, "memory(GiB)": 21.51, "elapsed_time": "9h 37m 11s", "remaining_time": "1h 59m 44s", "loss_scale": 1.0, "consumed_samples": 1348864, "global_step/max_steps": "5269/6362"} +{"lm loss": 4.87117386, "grad_norm": 0.31679666, "learning_rate": 1.061e-05, "elapsed_time_per_iteration": 6.47403574, "memory(GiB)": 21.51, "elapsed_time": "9h 37m 18s", "remaining_time": "1h 59m 37s", "loss_scale": 1.0, "consumed_samples": 1349120, "global_step/max_steps": "5270/6362"} +{"lm loss": 4.87290907, "grad_norm": 0.29276949, "learning_rate": 1.059e-05, "elapsed_time_per_iteration": 6.69630885, "memory(GiB)": 21.51, "elapsed_time": "9h 37m 24s", "remaining_time": "1h 59m 30s", "loss_scale": 1.0, "consumed_samples": 1349376, "global_step/max_steps": "5271/6362"} +{"lm loss": 4.8524313, "grad_norm": 0.28919759, "learning_rate": 1.058e-05, "elapsed_time_per_iteration": 6.48853827, "memory(GiB)": 21.51, "elapsed_time": "9h 37m 31s", "remaining_time": "1h 59m 24s", "loss_scale": 1.0, "consumed_samples": 1349632, "global_step/max_steps": "5272/6362"} +{"lm loss": 4.87315416, "grad_norm": 0.29832372, "learning_rate": 1.056e-05, "elapsed_time_per_iteration": 6.3593297, "memory(GiB)": 21.51, "elapsed_time": "9h 37m 37s", "remaining_time": "1h 59m 17s", "loss_scale": 1.0, "consumed_samples": 1349888, "global_step/max_steps": "5273/6362"} +{"lm loss": 4.86115646, "grad_norm": 0.30202013, "learning_rate": 1.055e-05, "elapsed_time_per_iteration": 6.32118154, "memory(GiB)": 21.51, "elapsed_time": "9h 37m 44s", "remaining_time": "1h 59m 11s", "loss_scale": 1.0, "consumed_samples": 1350144, "global_step/max_steps": "5274/6362"} +{"lm loss": 4.87930822, "grad_norm": 0.30240497, "learning_rate": 1.054e-05, "elapsed_time_per_iteration": 6.37167621, "memory(GiB)": 21.51, "elapsed_time": "9h 37m 50s", "remaining_time": "1h 59m 4s", "loss_scale": 1.0, "consumed_samples": 1350400, "global_step/max_steps": "5275/6362"} +{"lm loss": 4.87750626, "grad_norm": 0.29931283, "learning_rate": 1.052e-05, "elapsed_time_per_iteration": 6.25958347, "memory(GiB)": 21.51, "elapsed_time": "9h 37m 56s", "remaining_time": "1h 58m 57s", "loss_scale": 1.0, "consumed_samples": 1350656, "global_step/max_steps": "5276/6362"} +{"lm loss": 4.89197302, "grad_norm": 0.33358681, "learning_rate": 1.051e-05, "elapsed_time_per_iteration": 6.55766463, "memory(GiB)": 21.51, "elapsed_time": "9h 38m 3s", "remaining_time": "1h 58m 51s", "loss_scale": 1.0, "consumed_samples": 1350912, "global_step/max_steps": "5277/6362"} +{"lm loss": 4.86157703, "grad_norm": 0.30188525, "learning_rate": 1.05e-05, "elapsed_time_per_iteration": 6.43783307, "memory(GiB)": 21.51, "elapsed_time": "9h 38m 9s", "remaining_time": "1h 58m 44s", "loss_scale": 1.0, "consumed_samples": 1351168, "global_step/max_steps": "5278/6362"} +{"lm loss": 4.85366392, "grad_norm": 0.30601513, "learning_rate": 1.048e-05, "elapsed_time_per_iteration": 6.35940909, "memory(GiB)": 21.51, "elapsed_time": "9h 38m 16s", "remaining_time": "1h 58m 38s", "loss_scale": 1.0, "consumed_samples": 1351424, "global_step/max_steps": "5279/6362"} +{"lm loss": 4.86007977, "grad_norm": 0.30928105, "learning_rate": 1.047e-05, "elapsed_time_per_iteration": 6.55409908, "memory(GiB)": 21.51, "elapsed_time": "9h 38m 22s", "remaining_time": "1h 58m 31s", "loss_scale": 1.0, "consumed_samples": 1351680, "global_step/max_steps": "5280/6362"} +{"lm loss": 4.87653685, "grad_norm": 0.30075476, "learning_rate": 1.046e-05, "elapsed_time_per_iteration": 6.29722762, "memory(GiB)": 21.51, "elapsed_time": "9h 38m 29s", "remaining_time": "1h 58m 24s", "loss_scale": 1.0, "consumed_samples": 1351936, "global_step/max_steps": "5281/6362"} +{"lm loss": 4.89687634, "grad_norm": 0.29896048, "learning_rate": 1.044e-05, "elapsed_time_per_iteration": 6.27090001, "memory(GiB)": 21.51, "elapsed_time": "9h 38m 35s", "remaining_time": "1h 58m 18s", "loss_scale": 1.0, "consumed_samples": 1352192, "global_step/max_steps": "5282/6362"} +{"lm loss": 4.85790777, "grad_norm": 0.29888058, "learning_rate": 1.043e-05, "elapsed_time_per_iteration": 6.34304309, "memory(GiB)": 21.51, "elapsed_time": "9h 38m 41s", "remaining_time": "1h 58m 11s", "loss_scale": 1.0, "consumed_samples": 1352448, "global_step/max_steps": "5283/6362"} +{"lm loss": 4.88925743, "grad_norm": 0.30899847, "learning_rate": 1.042e-05, "elapsed_time_per_iteration": 6.21630478, "memory(GiB)": 21.51, "elapsed_time": "9h 38m 47s", "remaining_time": "1h 58m 4s", "loss_scale": 1.0, "consumed_samples": 1352704, "global_step/max_steps": "5284/6362"} +{"lm loss": 4.85055637, "grad_norm": 0.30120942, "learning_rate": 1.04e-05, "elapsed_time_per_iteration": 6.31197286, "memory(GiB)": 21.51, "elapsed_time": "9h 38m 54s", "remaining_time": "1h 57m 58s", "loss_scale": 1.0, "consumed_samples": 1352960, "global_step/max_steps": "5285/6362"} +{"lm loss": 4.87738323, "grad_norm": 0.30358589, "learning_rate": 1.039e-05, "elapsed_time_per_iteration": 6.46523929, "memory(GiB)": 21.51, "elapsed_time": "9h 39m 0s", "remaining_time": "1h 57m 51s", "loss_scale": 1.0, "consumed_samples": 1353216, "global_step/max_steps": "5286/6362"} +{"lm loss": 4.87326717, "grad_norm": 0.29384148, "learning_rate": 1.038e-05, "elapsed_time_per_iteration": 6.30467391, "memory(GiB)": 21.51, "elapsed_time": "9h 39m 6s", "remaining_time": "1h 57m 45s", "loss_scale": 1.0, "consumed_samples": 1353472, "global_step/max_steps": "5287/6362"} +{"lm loss": 4.88999367, "grad_norm": 0.30007923, "learning_rate": 1.036e-05, "elapsed_time_per_iteration": 6.46490622, "memory(GiB)": 21.51, "elapsed_time": "9h 39m 13s", "remaining_time": "1h 57m 38s", "loss_scale": 1.0, "consumed_samples": 1353728, "global_step/max_steps": "5288/6362"} +{"lm loss": 4.86157131, "grad_norm": 0.31922585, "learning_rate": 1.035e-05, "elapsed_time_per_iteration": 6.68082094, "memory(GiB)": 21.51, "elapsed_time": "9h 39m 20s", "remaining_time": "1h 57m 31s", "loss_scale": 1.0, "consumed_samples": 1353984, "global_step/max_steps": "5289/6362"} +{"lm loss": 4.86380816, "grad_norm": 0.29918915, "learning_rate": 1.034e-05, "elapsed_time_per_iteration": 6.51575351, "memory(GiB)": 21.51, "elapsed_time": "9h 39m 26s", "remaining_time": "1h 57m 25s", "loss_scale": 1.0, "consumed_samples": 1354240, "global_step/max_steps": "5290/6362"} +{"lm loss": 4.87655878, "grad_norm": 0.32382503, "learning_rate": 1.032e-05, "elapsed_time_per_iteration": 6.63209462, "memory(GiB)": 21.51, "elapsed_time": "9h 39m 33s", "remaining_time": "1h 57m 18s", "loss_scale": 1.0, "consumed_samples": 1354496, "global_step/max_steps": "5291/6362"} +{"lm loss": 4.85138226, "grad_norm": 0.30255148, "learning_rate": 1.031e-05, "elapsed_time_per_iteration": 6.66867018, "memory(GiB)": 21.51, "elapsed_time": "9h 39m 39s", "remaining_time": "1h 57m 12s", "loss_scale": 1.0, "consumed_samples": 1354752, "global_step/max_steps": "5292/6362"} +{"lm loss": 4.89127064, "grad_norm": 0.33040896, "learning_rate": 1.03e-05, "elapsed_time_per_iteration": 6.63748312, "memory(GiB)": 21.51, "elapsed_time": "9h 39m 46s", "remaining_time": "1h 57m 5s", "loss_scale": 1.0, "consumed_samples": 1355008, "global_step/max_steps": "5293/6362"} +{"lm loss": 4.86855984, "grad_norm": 0.29455176, "learning_rate": 1.028e-05, "elapsed_time_per_iteration": 6.36031294, "memory(GiB)": 21.51, "elapsed_time": "9h 39m 52s", "remaining_time": "1h 56m 59s", "loss_scale": 1.0, "consumed_samples": 1355264, "global_step/max_steps": "5294/6362"} +{"lm loss": 4.87105989, "grad_norm": 0.32522172, "learning_rate": 1.027e-05, "elapsed_time_per_iteration": 6.41172552, "memory(GiB)": 21.51, "elapsed_time": "9h 39m 59s", "remaining_time": "1h 56m 52s", "loss_scale": 1.0, "consumed_samples": 1355520, "global_step/max_steps": "5295/6362"} +{"lm loss": 4.89637709, "grad_norm": 0.30197266, "learning_rate": 1.026e-05, "elapsed_time_per_iteration": 6.67925358, "memory(GiB)": 21.51, "elapsed_time": "9h 40m 5s", "remaining_time": "1h 56m 45s", "loss_scale": 1.0, "consumed_samples": 1355776, "global_step/max_steps": "5296/6362"} +{"lm loss": 4.87294722, "grad_norm": 0.30481717, "learning_rate": 1.024e-05, "elapsed_time_per_iteration": 6.55887461, "memory(GiB)": 21.51, "elapsed_time": "9h 40m 12s", "remaining_time": "1h 56m 39s", "loss_scale": 1.0, "consumed_samples": 1356032, "global_step/max_steps": "5297/6362"} +{"lm loss": 4.84725332, "grad_norm": 0.32441184, "learning_rate": 1.023e-05, "elapsed_time_per_iteration": 6.52747202, "memory(GiB)": 21.51, "elapsed_time": "9h 40m 19s", "remaining_time": "1h 56m 32s", "loss_scale": 1.0, "consumed_samples": 1356288, "global_step/max_steps": "5298/6362"} +{"lm loss": 4.86214447, "grad_norm": 0.29350626, "learning_rate": 1.022e-05, "elapsed_time_per_iteration": 6.60614657, "memory(GiB)": 21.51, "elapsed_time": "9h 40m 25s", "remaining_time": "1h 56m 26s", "loss_scale": 1.0, "consumed_samples": 1356544, "global_step/max_steps": "5299/6362"} +{"lm loss": 4.84255266, "grad_norm": 0.30849418, "learning_rate": 1.02e-05, "elapsed_time_per_iteration": 6.48938131, "memory(GiB)": 21.51, "elapsed_time": "9h 40m 32s", "remaining_time": "1h 56m 19s", "loss_scale": 1.0, "consumed_samples": 1356800, "global_step/max_steps": "5300/6362"} +{"lm loss": 4.89452314, "grad_norm": 0.30349043, "learning_rate": 1.019e-05, "elapsed_time_per_iteration": 6.49692512, "memory(GiB)": 21.51, "elapsed_time": "9h 40m 38s", "remaining_time": "1h 56m 12s", "loss_scale": 1.0, "consumed_samples": 1357056, "global_step/max_steps": "5301/6362"} +{"lm loss": 4.87008905, "grad_norm": 0.32045245, "learning_rate": 1.018e-05, "elapsed_time_per_iteration": 6.57566595, "memory(GiB)": 21.51, "elapsed_time": "9h 40m 45s", "remaining_time": "1h 56m 6s", "loss_scale": 1.0, "consumed_samples": 1357312, "global_step/max_steps": "5302/6362"} +{"lm loss": 4.88848448, "grad_norm": 0.30627, "learning_rate": 1.016e-05, "elapsed_time_per_iteration": 6.62929153, "memory(GiB)": 21.51, "elapsed_time": "9h 40m 51s", "remaining_time": "1h 55m 59s", "loss_scale": 1.0, "consumed_samples": 1357568, "global_step/max_steps": "5303/6362"} +{"lm loss": 4.87417793, "grad_norm": 0.32944316, "learning_rate": 1.015e-05, "elapsed_time_per_iteration": 6.64696932, "memory(GiB)": 21.51, "elapsed_time": "9h 40m 58s", "remaining_time": "1h 55m 53s", "loss_scale": 1.0, "consumed_samples": 1357824, "global_step/max_steps": "5304/6362"} +{"lm loss": 4.89708042, "grad_norm": 0.2890783, "learning_rate": 1.014e-05, "elapsed_time_per_iteration": 6.35065746, "memory(GiB)": 21.51, "elapsed_time": "9h 41m 4s", "remaining_time": "1h 55m 46s", "loss_scale": 1.0, "consumed_samples": 1358080, "global_step/max_steps": "5305/6362"} +{"lm loss": 4.86941719, "grad_norm": 0.32070011, "learning_rate": 1.012e-05, "elapsed_time_per_iteration": 6.38931632, "memory(GiB)": 21.51, "elapsed_time": "9h 41m 11s", "remaining_time": "1h 55m 40s", "loss_scale": 1.0, "consumed_samples": 1358336, "global_step/max_steps": "5306/6362"} +{"lm loss": 4.88150358, "grad_norm": 0.30958253, "learning_rate": 1.011e-05, "elapsed_time_per_iteration": 6.50934029, "memory(GiB)": 21.51, "elapsed_time": "9h 41m 17s", "remaining_time": "1h 55m 33s", "loss_scale": 1.0, "consumed_samples": 1358592, "global_step/max_steps": "5307/6362"} +{"lm loss": 4.86816645, "grad_norm": 0.32698369, "learning_rate": 1.01e-05, "elapsed_time_per_iteration": 6.58904934, "memory(GiB)": 21.51, "elapsed_time": "9h 41m 24s", "remaining_time": "1h 55m 26s", "loss_scale": 1.0, "consumed_samples": 1358848, "global_step/max_steps": "5308/6362"} +{"lm loss": 4.89980936, "grad_norm": 0.31382596, "learning_rate": 1.009e-05, "elapsed_time_per_iteration": 6.55611944, "memory(GiB)": 21.51, "elapsed_time": "9h 41m 30s", "remaining_time": "1h 55m 20s", "loss_scale": 1.0, "consumed_samples": 1359104, "global_step/max_steps": "5309/6362"} +{"lm loss": 4.86104345, "grad_norm": 0.33434969, "learning_rate": 1.007e-05, "elapsed_time_per_iteration": 6.54824972, "memory(GiB)": 21.51, "elapsed_time": "9h 41m 37s", "remaining_time": "1h 55m 13s", "loss_scale": 1.0, "consumed_samples": 1359360, "global_step/max_steps": "5310/6362"} +{"lm loss": 4.86486959, "grad_norm": 0.32501245, "learning_rate": 1.006e-05, "elapsed_time_per_iteration": 6.61673975, "memory(GiB)": 21.51, "elapsed_time": "9h 41m 44s", "remaining_time": "1h 55m 7s", "loss_scale": 1.0, "consumed_samples": 1359616, "global_step/max_steps": "5311/6362"} +{"lm loss": 4.87989712, "grad_norm": 0.306912, "learning_rate": 1.005e-05, "elapsed_time_per_iteration": 6.60587788, "memory(GiB)": 21.51, "elapsed_time": "9h 41m 50s", "remaining_time": "1h 55m 0s", "loss_scale": 1.0, "consumed_samples": 1359872, "global_step/max_steps": "5312/6362"} +{"lm loss": 4.87799883, "grad_norm": 0.32969278, "learning_rate": 1.003e-05, "elapsed_time_per_iteration": 6.49570966, "memory(GiB)": 21.51, "elapsed_time": "9h 41m 57s", "remaining_time": "1h 54m 54s", "loss_scale": 1.0, "consumed_samples": 1360128, "global_step/max_steps": "5313/6362"} +{"lm loss": 4.88499117, "grad_norm": 0.28570715, "learning_rate": 1.002e-05, "elapsed_time_per_iteration": 6.5171783, "memory(GiB)": 21.51, "elapsed_time": "9h 42m 3s", "remaining_time": "1h 54m 47s", "loss_scale": 1.0, "consumed_samples": 1360384, "global_step/max_steps": "5314/6362"} +{"lm loss": 4.85948181, "grad_norm": 0.32637337, "learning_rate": 1.001e-05, "elapsed_time_per_iteration": 6.22106624, "memory(GiB)": 21.51, "elapsed_time": "9h 42m 9s", "remaining_time": "1h 54m 40s", "loss_scale": 1.0, "consumed_samples": 1360640, "global_step/max_steps": "5315/6362"} +{"lm loss": 4.88254786, "grad_norm": 0.30302194, "learning_rate": 9.99e-06, "elapsed_time_per_iteration": 6.38474655, "memory(GiB)": 21.51, "elapsed_time": "9h 42m 16s", "remaining_time": "1h 54m 34s", "loss_scale": 1.0, "consumed_samples": 1360896, "global_step/max_steps": "5316/6362"} +{"lm loss": 4.86142206, "grad_norm": 0.31642473, "learning_rate": 9.98e-06, "elapsed_time_per_iteration": 6.158813, "memory(GiB)": 21.51, "elapsed_time": "9h 42m 22s", "remaining_time": "1h 54m 27s", "loss_scale": 1.0, "consumed_samples": 1361152, "global_step/max_steps": "5317/6362"} +{"lm loss": 4.8565259, "grad_norm": 0.29828811, "learning_rate": 9.97e-06, "elapsed_time_per_iteration": 6.42439342, "memory(GiB)": 21.51, "elapsed_time": "9h 42m 28s", "remaining_time": "1h 54m 20s", "loss_scale": 1.0, "consumed_samples": 1361408, "global_step/max_steps": "5318/6362"} +{"lm loss": 4.88969564, "grad_norm": 0.28863826, "learning_rate": 9.95e-06, "elapsed_time_per_iteration": 6.45366073, "memory(GiB)": 21.51, "elapsed_time": "9h 42m 35s", "remaining_time": "1h 54m 14s", "loss_scale": 1.0, "consumed_samples": 1361664, "global_step/max_steps": "5319/6362"} +{"lm loss": 4.8639183, "grad_norm": 0.2899425, "learning_rate": 9.94e-06, "elapsed_time_per_iteration": 6.42591906, "memory(GiB)": 21.51, "elapsed_time": "9h 42m 41s", "remaining_time": "1h 54m 7s", "loss_scale": 1.0, "consumed_samples": 1361920, "global_step/max_steps": "5320/6362"} +{"lm loss": 4.87757683, "grad_norm": 0.30216712, "learning_rate": 9.93e-06, "elapsed_time_per_iteration": 6.63118482, "memory(GiB)": 21.51, "elapsed_time": "9h 42m 48s", "remaining_time": "1h 54m 1s", "loss_scale": 1.0, "consumed_samples": 1362176, "global_step/max_steps": "5321/6362"} +{"lm loss": 4.8769846, "grad_norm": 0.2881569, "learning_rate": 9.92e-06, "elapsed_time_per_iteration": 6.57087469, "memory(GiB)": 21.51, "elapsed_time": "9h 42m 54s", "remaining_time": "1h 53m 54s", "loss_scale": 1.0, "consumed_samples": 1362432, "global_step/max_steps": "5322/6362"} +{"lm loss": 4.86259699, "grad_norm": 0.29811433, "learning_rate": 9.9e-06, "elapsed_time_per_iteration": 6.51209903, "memory(GiB)": 21.51, "elapsed_time": "9h 43m 1s", "remaining_time": "1h 53m 48s", "loss_scale": 1.0, "consumed_samples": 1362688, "global_step/max_steps": "5323/6362"} +{"lm loss": 4.865623, "grad_norm": 0.30065313, "learning_rate": 9.89e-06, "elapsed_time_per_iteration": 6.67224884, "memory(GiB)": 21.51, "elapsed_time": "9h 43m 8s", "remaining_time": "1h 53m 41s", "loss_scale": 1.0, "consumed_samples": 1362944, "global_step/max_steps": "5324/6362"} +{"lm loss": 4.85999441, "grad_norm": 0.27969018, "learning_rate": 9.88e-06, "elapsed_time_per_iteration": 6.59495616, "memory(GiB)": 21.51, "elapsed_time": "9h 43m 14s", "remaining_time": "1h 53m 34s", "loss_scale": 1.0, "consumed_samples": 1363200, "global_step/max_steps": "5325/6362"} +{"lm loss": 4.87830591, "grad_norm": 0.29328302, "learning_rate": 9.86e-06, "elapsed_time_per_iteration": 6.33502078, "memory(GiB)": 21.51, "elapsed_time": "9h 43m 21s", "remaining_time": "1h 53m 28s", "loss_scale": 1.0, "consumed_samples": 1363456, "global_step/max_steps": "5326/6362"} +{"lm loss": 4.87917614, "grad_norm": 0.2957682, "learning_rate": 9.85e-06, "elapsed_time_per_iteration": 6.59356022, "memory(GiB)": 21.51, "elapsed_time": "9h 43m 27s", "remaining_time": "1h 53m 21s", "loss_scale": 1.0, "consumed_samples": 1363712, "global_step/max_steps": "5327/6362"} +{"lm loss": 4.87757397, "grad_norm": 0.31454587, "learning_rate": 9.84e-06, "elapsed_time_per_iteration": 6.2950418, "memory(GiB)": 21.51, "elapsed_time": "9h 43m 33s", "remaining_time": "1h 53m 15s", "loss_scale": 1.0, "consumed_samples": 1363968, "global_step/max_steps": "5328/6362"} +{"lm loss": 4.89167738, "grad_norm": 0.27852648, "learning_rate": 9.83e-06, "elapsed_time_per_iteration": 6.29557371, "memory(GiB)": 21.51, "elapsed_time": "9h 43m 40s", "remaining_time": "1h 53m 8s", "loss_scale": 1.0, "consumed_samples": 1364224, "global_step/max_steps": "5329/6362"} +{"lm loss": 4.8633256, "grad_norm": 0.30054796, "learning_rate": 9.81e-06, "elapsed_time_per_iteration": 6.46538186, "memory(GiB)": 21.51, "elapsed_time": "9h 43m 46s", "remaining_time": "1h 53m 1s", "loss_scale": 1.0, "consumed_samples": 1364480, "global_step/max_steps": "5330/6362"} +{"lm loss": 4.8811121, "grad_norm": 0.29731929, "learning_rate": 9.8e-06, "elapsed_time_per_iteration": 6.27566648, "memory(GiB)": 21.51, "elapsed_time": "9h 43m 52s", "remaining_time": "1h 52m 55s", "loss_scale": 1.0, "consumed_samples": 1364736, "global_step/max_steps": "5331/6362"} +{"lm loss": 4.87666559, "grad_norm": 0.33087844, "learning_rate": 9.79e-06, "elapsed_time_per_iteration": 6.55658484, "memory(GiB)": 21.51, "elapsed_time": "9h 43m 59s", "remaining_time": "1h 52m 48s", "loss_scale": 1.0, "consumed_samples": 1364992, "global_step/max_steps": "5332/6362"} +{"lm loss": 4.87387657, "grad_norm": 0.30646226, "learning_rate": 9.77e-06, "elapsed_time_per_iteration": 6.63063931, "memory(GiB)": 21.51, "elapsed_time": "9h 44m 6s", "remaining_time": "1h 52m 42s", "loss_scale": 1.0, "consumed_samples": 1365248, "global_step/max_steps": "5333/6362"} +{"lm loss": 4.87693214, "grad_norm": 0.31165963, "learning_rate": 9.76e-06, "elapsed_time_per_iteration": 6.72585726, "memory(GiB)": 21.51, "elapsed_time": "9h 44m 12s", "remaining_time": "1h 52m 35s", "loss_scale": 1.0, "consumed_samples": 1365504, "global_step/max_steps": "5334/6362"} +{"lm loss": 4.85880613, "grad_norm": 0.32583097, "learning_rate": 9.75e-06, "elapsed_time_per_iteration": 6.43312621, "memory(GiB)": 21.51, "elapsed_time": "9h 44m 19s", "remaining_time": "1h 52m 29s", "loss_scale": 1.0, "consumed_samples": 1365760, "global_step/max_steps": "5335/6362"} +{"lm loss": 4.90036535, "grad_norm": 0.30345964, "learning_rate": 9.74e-06, "elapsed_time_per_iteration": 6.43539429, "memory(GiB)": 21.51, "elapsed_time": "9h 44m 25s", "remaining_time": "1h 52m 22s", "loss_scale": 1.0, "consumed_samples": 1366016, "global_step/max_steps": "5336/6362"} +{"lm loss": 4.89008713, "grad_norm": 0.29638183, "learning_rate": 9.72e-06, "elapsed_time_per_iteration": 6.60925007, "memory(GiB)": 21.51, "elapsed_time": "9h 44m 32s", "remaining_time": "1h 52m 15s", "loss_scale": 1.0, "consumed_samples": 1366272, "global_step/max_steps": "5337/6362"} +{"lm loss": 4.87382889, "grad_norm": 0.34693596, "learning_rate": 9.71e-06, "elapsed_time_per_iteration": 6.59727263, "memory(GiB)": 21.51, "elapsed_time": "9h 44m 38s", "remaining_time": "1h 52m 9s", "loss_scale": 1.0, "consumed_samples": 1366528, "global_step/max_steps": "5338/6362"} +{"lm loss": 4.89328909, "grad_norm": 0.28720137, "learning_rate": 9.7e-06, "elapsed_time_per_iteration": 6.43749785, "memory(GiB)": 21.51, "elapsed_time": "9h 44m 45s", "remaining_time": "1h 52m 2s", "loss_scale": 1.0, "consumed_samples": 1366784, "global_step/max_steps": "5339/6362"} +{"lm loss": 4.84929514, "grad_norm": 0.29533482, "learning_rate": 9.68e-06, "elapsed_time_per_iteration": 6.71190953, "memory(GiB)": 21.51, "elapsed_time": "9h 44m 52s", "remaining_time": "1h 51m 56s", "loss_scale": 1.0, "consumed_samples": 1367040, "global_step/max_steps": "5340/6362"} +{"lm loss": 4.86821461, "grad_norm": 0.29735467, "learning_rate": 9.67e-06, "elapsed_time_per_iteration": 6.26847291, "memory(GiB)": 21.51, "elapsed_time": "9h 44m 58s", "remaining_time": "1h 51m 49s", "loss_scale": 1.0, "consumed_samples": 1367296, "global_step/max_steps": "5341/6362"} +{"lm loss": 4.85924292, "grad_norm": 0.30521896, "learning_rate": 9.66e-06, "elapsed_time_per_iteration": 6.4352088, "memory(GiB)": 21.51, "elapsed_time": "9h 45m 4s", "remaining_time": "1h 51m 42s", "loss_scale": 1.0, "consumed_samples": 1367552, "global_step/max_steps": "5342/6362"} +{"lm loss": 4.83881521, "grad_norm": 0.30097088, "learning_rate": 9.65e-06, "elapsed_time_per_iteration": 6.32538128, "memory(GiB)": 21.51, "elapsed_time": "9h 45m 11s", "remaining_time": "1h 51m 36s", "loss_scale": 1.0, "consumed_samples": 1367808, "global_step/max_steps": "5343/6362"} +{"lm loss": 4.85588884, "grad_norm": 0.30028734, "learning_rate": 9.63e-06, "elapsed_time_per_iteration": 6.37850308, "memory(GiB)": 21.51, "elapsed_time": "9h 45m 17s", "remaining_time": "1h 51m 29s", "loss_scale": 1.0, "consumed_samples": 1368064, "global_step/max_steps": "5344/6362"} +{"lm loss": 4.86053371, "grad_norm": 0.30122656, "learning_rate": 9.62e-06, "elapsed_time_per_iteration": 6.58883953, "memory(GiB)": 21.51, "elapsed_time": "9h 45m 24s", "remaining_time": "1h 51m 23s", "loss_scale": 1.0, "consumed_samples": 1368320, "global_step/max_steps": "5345/6362"} +{"lm loss": 4.85793686, "grad_norm": 0.29021591, "learning_rate": 9.61e-06, "elapsed_time_per_iteration": 6.42534137, "memory(GiB)": 21.51, "elapsed_time": "9h 45m 30s", "remaining_time": "1h 51m 16s", "loss_scale": 1.0, "consumed_samples": 1368576, "global_step/max_steps": "5346/6362"} +{"lm loss": 4.87766171, "grad_norm": 0.29042017, "learning_rate": 9.59e-06, "elapsed_time_per_iteration": 6.63255119, "memory(GiB)": 21.51, "elapsed_time": "9h 45m 37s", "remaining_time": "1h 51m 9s", "loss_scale": 1.0, "consumed_samples": 1368832, "global_step/max_steps": "5347/6362"} +{"lm loss": 4.86514807, "grad_norm": 0.29441449, "learning_rate": 9.58e-06, "elapsed_time_per_iteration": 6.47733426, "memory(GiB)": 21.51, "elapsed_time": "9h 45m 43s", "remaining_time": "1h 51m 3s", "loss_scale": 1.0, "consumed_samples": 1369088, "global_step/max_steps": "5348/6362"} +{"lm loss": 4.87469101, "grad_norm": 0.29788271, "learning_rate": 9.57e-06, "elapsed_time_per_iteration": 6.28415489, "memory(GiB)": 21.51, "elapsed_time": "9h 45m 49s", "remaining_time": "1h 50m 56s", "loss_scale": 1.0, "consumed_samples": 1369344, "global_step/max_steps": "5349/6362"} +{"lm loss": 4.84952736, "grad_norm": 0.29138836, "learning_rate": 9.56e-06, "elapsed_time_per_iteration": 6.5403142, "memory(GiB)": 21.51, "elapsed_time": "9h 45m 56s", "remaining_time": "1h 50m 50s", "loss_scale": 1.0, "consumed_samples": 1369600, "global_step/max_steps": "5350/6362"} +{"lm loss": 4.83677483, "grad_norm": 0.30218366, "learning_rate": 9.54e-06, "elapsed_time_per_iteration": 6.59781384, "memory(GiB)": 21.51, "elapsed_time": "9h 46m 3s", "remaining_time": "1h 50m 43s", "loss_scale": 1.0, "consumed_samples": 1369856, "global_step/max_steps": "5351/6362"} +{"lm loss": 4.86903429, "grad_norm": 0.29345712, "learning_rate": 9.53e-06, "elapsed_time_per_iteration": 6.52802467, "memory(GiB)": 21.51, "elapsed_time": "9h 46m 9s", "remaining_time": "1h 50m 37s", "loss_scale": 1.0, "consumed_samples": 1370112, "global_step/max_steps": "5352/6362"} +{"lm loss": 4.88386059, "grad_norm": 0.29513451, "learning_rate": 9.52e-06, "elapsed_time_per_iteration": 6.64568448, "memory(GiB)": 21.51, "elapsed_time": "9h 46m 16s", "remaining_time": "1h 50m 30s", "loss_scale": 1.0, "consumed_samples": 1370368, "global_step/max_steps": "5353/6362"} +{"lm loss": 4.86170387, "grad_norm": 0.29140547, "learning_rate": 9.51e-06, "elapsed_time_per_iteration": 6.64438486, "memory(GiB)": 21.51, "elapsed_time": "9h 46m 22s", "remaining_time": "1h 50m 23s", "loss_scale": 1.0, "consumed_samples": 1370624, "global_step/max_steps": "5354/6362"} +{"lm loss": 4.88972998, "grad_norm": 0.28915292, "learning_rate": 9.49e-06, "elapsed_time_per_iteration": 6.45838857, "memory(GiB)": 21.51, "elapsed_time": "9h 46m 29s", "remaining_time": "1h 50m 17s", "loss_scale": 1.0, "consumed_samples": 1370880, "global_step/max_steps": "5355/6362"} +{"lm loss": 4.85688257, "grad_norm": 0.30572432, "learning_rate": 9.48e-06, "elapsed_time_per_iteration": 6.52055264, "memory(GiB)": 21.51, "elapsed_time": "9h 46m 35s", "remaining_time": "1h 50m 10s", "loss_scale": 1.0, "consumed_samples": 1371136, "global_step/max_steps": "5356/6362"} +{"lm loss": 4.8730135, "grad_norm": 0.28835604, "learning_rate": 9.47e-06, "elapsed_time_per_iteration": 6.39827967, "memory(GiB)": 21.51, "elapsed_time": "9h 46m 42s", "remaining_time": "1h 50m 4s", "loss_scale": 1.0, "consumed_samples": 1371392, "global_step/max_steps": "5357/6362"} +{"lm loss": 4.87829733, "grad_norm": 0.29046994, "learning_rate": 9.46e-06, "elapsed_time_per_iteration": 6.29850078, "memory(GiB)": 21.51, "elapsed_time": "9h 46m 48s", "remaining_time": "1h 49m 57s", "loss_scale": 1.0, "consumed_samples": 1371648, "global_step/max_steps": "5358/6362"} +{"lm loss": 4.85465956, "grad_norm": 0.2884011, "learning_rate": 9.44e-06, "elapsed_time_per_iteration": 6.35757327, "memory(GiB)": 21.51, "elapsed_time": "9h 46m 54s", "remaining_time": "1h 49m 50s", "loss_scale": 1.0, "consumed_samples": 1371904, "global_step/max_steps": "5359/6362"} +{"lm loss": 4.87548161, "grad_norm": 0.29846069, "learning_rate": 9.43e-06, "elapsed_time_per_iteration": 6.43380117, "memory(GiB)": 21.51, "elapsed_time": "9h 47m 1s", "remaining_time": "1h 49m 44s", "loss_scale": 1.0, "consumed_samples": 1372160, "global_step/max_steps": "5360/6362"} +{"lm loss": 4.87838554, "grad_norm": 0.29735258, "learning_rate": 9.42e-06, "elapsed_time_per_iteration": 6.7005868, "memory(GiB)": 21.51, "elapsed_time": "9h 47m 8s", "remaining_time": "1h 49m 37s", "loss_scale": 1.0, "consumed_samples": 1372416, "global_step/max_steps": "5361/6362"} +{"lm loss": 4.87340546, "grad_norm": 0.2851761, "learning_rate": 9.41e-06, "elapsed_time_per_iteration": 6.38076138, "memory(GiB)": 21.51, "elapsed_time": "9h 47m 14s", "remaining_time": "1h 49m 31s", "loss_scale": 1.0, "consumed_samples": 1372672, "global_step/max_steps": "5362/6362"} +{"lm loss": 4.85668707, "grad_norm": 0.28896782, "learning_rate": 9.39e-06, "elapsed_time_per_iteration": 6.53100348, "memory(GiB)": 21.51, "elapsed_time": "9h 47m 20s", "remaining_time": "1h 49m 24s", "loss_scale": 1.0, "consumed_samples": 1372928, "global_step/max_steps": "5363/6362"} +{"lm loss": 4.88397408, "grad_norm": 0.31499013, "learning_rate": 9.38e-06, "elapsed_time_per_iteration": 6.5834868, "memory(GiB)": 21.51, "elapsed_time": "9h 47m 27s", "remaining_time": "1h 49m 17s", "loss_scale": 1.0, "consumed_samples": 1373184, "global_step/max_steps": "5364/6362"} +{"lm loss": 4.85727644, "grad_norm": 0.30428144, "learning_rate": 9.37e-06, "elapsed_time_per_iteration": 6.53597045, "memory(GiB)": 21.51, "elapsed_time": "9h 47m 34s", "remaining_time": "1h 49m 11s", "loss_scale": 1.0, "consumed_samples": 1373440, "global_step/max_steps": "5365/6362"} +{"lm loss": 4.88404703, "grad_norm": 0.30020797, "learning_rate": 9.36e-06, "elapsed_time_per_iteration": 6.55825567, "memory(GiB)": 21.51, "elapsed_time": "9h 47m 40s", "remaining_time": "1h 49m 4s", "loss_scale": 1.0, "consumed_samples": 1373696, "global_step/max_steps": "5366/6362"} +{"lm loss": 4.89174509, "grad_norm": 0.30875814, "learning_rate": 9.34e-06, "elapsed_time_per_iteration": 6.58143687, "memory(GiB)": 21.51, "elapsed_time": "9h 47m 47s", "remaining_time": "1h 48m 58s", "loss_scale": 1.0, "consumed_samples": 1373952, "global_step/max_steps": "5367/6362"} +{"lm loss": 4.87498474, "grad_norm": 0.30438316, "learning_rate": 9.33e-06, "elapsed_time_per_iteration": 6.59713817, "memory(GiB)": 21.51, "elapsed_time": "9h 47m 53s", "remaining_time": "1h 48m 51s", "loss_scale": 1.0, "consumed_samples": 1374208, "global_step/max_steps": "5368/6362"} +{"lm loss": 4.86952829, "grad_norm": 0.31010747, "learning_rate": 9.32e-06, "elapsed_time_per_iteration": 6.59463668, "memory(GiB)": 21.51, "elapsed_time": "9h 48m 0s", "remaining_time": "1h 48m 45s", "loss_scale": 1.0, "consumed_samples": 1374464, "global_step/max_steps": "5369/6362"} +{"lm loss": 4.84552431, "grad_norm": 0.31170562, "learning_rate": 9.31e-06, "elapsed_time_per_iteration": 7.02690864, "memory(GiB)": 21.51, "elapsed_time": "9h 48m 7s", "remaining_time": "1h 48m 38s", "loss_scale": 1.0, "consumed_samples": 1374720, "global_step/max_steps": "5370/6362"} +{"lm loss": 4.87882233, "grad_norm": 0.28109947, "learning_rate": 9.29e-06, "elapsed_time_per_iteration": 6.6710763, "memory(GiB)": 21.51, "elapsed_time": "9h 48m 14s", "remaining_time": "1h 48m 32s", "loss_scale": 1.0, "consumed_samples": 1374976, "global_step/max_steps": "5371/6362"} +{"lm loss": 4.87222052, "grad_norm": 0.33742878, "learning_rate": 9.28e-06, "elapsed_time_per_iteration": 6.44732761, "memory(GiB)": 21.51, "elapsed_time": "9h 48m 20s", "remaining_time": "1h 48m 25s", "loss_scale": 1.0, "consumed_samples": 1375232, "global_step/max_steps": "5372/6362"} +{"lm loss": 4.84494495, "grad_norm": 0.31435806, "learning_rate": 9.27e-06, "elapsed_time_per_iteration": 6.52281165, "memory(GiB)": 21.51, "elapsed_time": "9h 48m 27s", "remaining_time": "1h 48m 18s", "loss_scale": 1.0, "consumed_samples": 1375488, "global_step/max_steps": "5373/6362"} +{"lm loss": 4.87531996, "grad_norm": 0.32913369, "learning_rate": 9.26e-06, "elapsed_time_per_iteration": 6.59792972, "memory(GiB)": 21.51, "elapsed_time": "9h 48m 33s", "remaining_time": "1h 48m 12s", "loss_scale": 1.0, "consumed_samples": 1375744, "global_step/max_steps": "5374/6362"} +{"lm loss": 4.85715628, "grad_norm": 0.29103896, "learning_rate": 9.24e-06, "elapsed_time_per_iteration": 6.62403083, "memory(GiB)": 21.51, "elapsed_time": "9h 48m 40s", "remaining_time": "1h 48m 5s", "loss_scale": 1.0, "consumed_samples": 1376000, "global_step/max_steps": "5375/6362"} +{"lm loss": 4.85562754, "grad_norm": 0.30886695, "learning_rate": 9.23e-06, "elapsed_time_per_iteration": 6.69911218, "memory(GiB)": 21.51, "elapsed_time": "9h 48m 47s", "remaining_time": "1h 47m 59s", "loss_scale": 1.0, "consumed_samples": 1376256, "global_step/max_steps": "5376/6362"} +{"lm loss": 4.86448383, "grad_norm": 0.28590792, "learning_rate": 9.22e-06, "elapsed_time_per_iteration": 6.44981503, "memory(GiB)": 21.51, "elapsed_time": "9h 48m 53s", "remaining_time": "1h 47m 52s", "loss_scale": 1.0, "consumed_samples": 1376512, "global_step/max_steps": "5377/6362"} +{"lm loss": 4.85651398, "grad_norm": 0.30774575, "learning_rate": 9.21e-06, "elapsed_time_per_iteration": 6.49247742, "memory(GiB)": 21.51, "elapsed_time": "9h 48m 59s", "remaining_time": "1h 47m 46s", "loss_scale": 1.0, "consumed_samples": 1376768, "global_step/max_steps": "5378/6362"} +{"lm loss": 4.88003445, "grad_norm": 0.29088008, "learning_rate": 9.19e-06, "elapsed_time_per_iteration": 6.5434351, "memory(GiB)": 21.51, "elapsed_time": "9h 49m 6s", "remaining_time": "1h 47m 39s", "loss_scale": 1.0, "consumed_samples": 1377024, "global_step/max_steps": "5379/6362"} +{"lm loss": 4.84443474, "grad_norm": 0.2830511, "learning_rate": 9.18e-06, "elapsed_time_per_iteration": 6.3796618, "memory(GiB)": 21.51, "elapsed_time": "9h 49m 12s", "remaining_time": "1h 47m 32s", "loss_scale": 1.0, "consumed_samples": 1377280, "global_step/max_steps": "5380/6362"} +{"lm loss": 4.87896156, "grad_norm": 0.30292726, "learning_rate": 9.17e-06, "elapsed_time_per_iteration": 6.53543425, "memory(GiB)": 21.51, "elapsed_time": "9h 49m 19s", "remaining_time": "1h 47m 26s", "loss_scale": 1.0, "consumed_samples": 1377536, "global_step/max_steps": "5381/6362"} +{"lm loss": 4.87937593, "grad_norm": 0.29949847, "learning_rate": 9.16e-06, "elapsed_time_per_iteration": 6.78123522, "memory(GiB)": 21.51, "elapsed_time": "9h 49m 26s", "remaining_time": "1h 47m 19s", "loss_scale": 1.0, "consumed_samples": 1377792, "global_step/max_steps": "5382/6362"} +{"lm loss": 4.89180326, "grad_norm": 0.30444315, "learning_rate": 9.15e-06, "elapsed_time_per_iteration": 6.35021663, "memory(GiB)": 21.51, "elapsed_time": "9h 49m 32s", "remaining_time": "1h 47m 13s", "loss_scale": 1.0, "consumed_samples": 1378048, "global_step/max_steps": "5383/6362"} +{"lm loss": 4.8822341, "grad_norm": 0.31083813, "learning_rate": 9.13e-06, "elapsed_time_per_iteration": 6.53693366, "memory(GiB)": 21.51, "elapsed_time": "9h 49m 39s", "remaining_time": "1h 47m 6s", "loss_scale": 1.0, "consumed_samples": 1378304, "global_step/max_steps": "5384/6362"} +{"lm loss": 4.86099148, "grad_norm": 0.29683658, "learning_rate": 9.12e-06, "elapsed_time_per_iteration": 6.40201569, "memory(GiB)": 21.51, "elapsed_time": "9h 49m 45s", "remaining_time": "1h 46m 59s", "loss_scale": 1.0, "consumed_samples": 1378560, "global_step/max_steps": "5385/6362"} +{"lm loss": 4.85866785, "grad_norm": 0.3117798, "learning_rate": 9.11e-06, "elapsed_time_per_iteration": 6.51012158, "memory(GiB)": 21.51, "elapsed_time": "9h 49m 51s", "remaining_time": "1h 46m 53s", "loss_scale": 1.0, "consumed_samples": 1378816, "global_step/max_steps": "5386/6362"} +{"lm loss": 4.8624773, "grad_norm": 0.30314961, "learning_rate": 9.1e-06, "elapsed_time_per_iteration": 6.38103867, "memory(GiB)": 21.51, "elapsed_time": "9h 49m 58s", "remaining_time": "1h 46m 46s", "loss_scale": 1.0, "consumed_samples": 1379072, "global_step/max_steps": "5387/6362"} +{"lm loss": 4.8695035, "grad_norm": 0.29840964, "learning_rate": 9.08e-06, "elapsed_time_per_iteration": 6.52969074, "memory(GiB)": 21.51, "elapsed_time": "9h 50m 4s", "remaining_time": "1h 46m 40s", "loss_scale": 1.0, "consumed_samples": 1379328, "global_step/max_steps": "5388/6362"} +{"lm loss": 4.86604643, "grad_norm": 0.30026123, "learning_rate": 9.07e-06, "elapsed_time_per_iteration": 6.60460401, "memory(GiB)": 21.51, "elapsed_time": "9h 50m 11s", "remaining_time": "1h 46m 33s", "loss_scale": 1.0, "consumed_samples": 1379584, "global_step/max_steps": "5389/6362"} +{"lm loss": 4.86993122, "grad_norm": 0.31112415, "learning_rate": 9.06e-06, "elapsed_time_per_iteration": 6.488657, "memory(GiB)": 21.51, "elapsed_time": "9h 50m 18s", "remaining_time": "1h 46m 27s", "loss_scale": 1.0, "consumed_samples": 1379840, "global_step/max_steps": "5390/6362"} +{"lm loss": 4.86720324, "grad_norm": 0.29906961, "learning_rate": 9.05e-06, "elapsed_time_per_iteration": 6.4172852, "memory(GiB)": 21.51, "elapsed_time": "9h 50m 24s", "remaining_time": "1h 46m 20s", "loss_scale": 1.0, "consumed_samples": 1380096, "global_step/max_steps": "5391/6362"} +{"lm loss": 4.8811841, "grad_norm": 0.32285374, "learning_rate": 9.04e-06, "elapsed_time_per_iteration": 6.79862666, "memory(GiB)": 21.51, "elapsed_time": "9h 50m 31s", "remaining_time": "1h 46m 13s", "loss_scale": 1.0, "consumed_samples": 1380352, "global_step/max_steps": "5392/6362"} +{"lm loss": 4.85830593, "grad_norm": 0.28752431, "learning_rate": 9.02e-06, "elapsed_time_per_iteration": 6.33759546, "memory(GiB)": 21.51, "elapsed_time": "9h 50m 37s", "remaining_time": "1h 46m 7s", "loss_scale": 1.0, "consumed_samples": 1380608, "global_step/max_steps": "5393/6362"} +{"lm loss": 4.88255358, "grad_norm": 0.30419004, "learning_rate": 9.01e-06, "elapsed_time_per_iteration": 6.77563357, "memory(GiB)": 21.51, "elapsed_time": "9h 50m 44s", "remaining_time": "1h 46m 0s", "loss_scale": 1.0, "consumed_samples": 1380864, "global_step/max_steps": "5394/6362"} +{"lm loss": 4.87782145, "grad_norm": 0.3052502, "learning_rate": 9e-06, "elapsed_time_per_iteration": 6.54659963, "memory(GiB)": 21.51, "elapsed_time": "9h 50m 50s", "remaining_time": "1h 45m 54s", "loss_scale": 1.0, "consumed_samples": 1381120, "global_step/max_steps": "5395/6362"} +{"lm loss": 4.86445618, "grad_norm": 0.30756453, "learning_rate": 8.99e-06, "elapsed_time_per_iteration": 6.68578172, "memory(GiB)": 21.51, "elapsed_time": "9h 50m 57s", "remaining_time": "1h 45m 47s", "loss_scale": 1.0, "consumed_samples": 1381376, "global_step/max_steps": "5396/6362"} +{"lm loss": 4.86575222, "grad_norm": 0.30913475, "learning_rate": 8.97e-06, "elapsed_time_per_iteration": 6.56240034, "memory(GiB)": 21.51, "elapsed_time": "9h 51m 4s", "remaining_time": "1h 45m 41s", "loss_scale": 1.0, "consumed_samples": 1381632, "global_step/max_steps": "5397/6362"} +{"lm loss": 4.87567377, "grad_norm": 0.28880754, "learning_rate": 8.96e-06, "elapsed_time_per_iteration": 6.64983106, "memory(GiB)": 21.51, "elapsed_time": "9h 51m 10s", "remaining_time": "1h 45m 34s", "loss_scale": 1.0, "consumed_samples": 1381888, "global_step/max_steps": "5398/6362"} +{"lm loss": 4.8800869, "grad_norm": 0.33790538, "learning_rate": 8.95e-06, "elapsed_time_per_iteration": 6.49800062, "memory(GiB)": 21.51, "elapsed_time": "9h 51m 17s", "remaining_time": "1h 45m 27s", "loss_scale": 1.0, "consumed_samples": 1382144, "global_step/max_steps": "5399/6362"} +{"lm loss": 4.85824442, "grad_norm": 0.29541495, "learning_rate": 8.94e-06, "elapsed_time_per_iteration": 6.66232753, "memory(GiB)": 21.51, "elapsed_time": "9h 51m 23s", "remaining_time": "1h 45m 21s", "loss_scale": 1.0, "consumed_samples": 1382400, "global_step/max_steps": "5400/6362"} +{"lm loss": 4.87291002, "grad_norm": 0.33251026, "learning_rate": 8.93e-06, "elapsed_time_per_iteration": 6.956007, "memory(GiB)": 21.51, "elapsed_time": "9h 51m 30s", "remaining_time": "1h 45m 14s", "loss_scale": 1.0, "consumed_samples": 1382656, "global_step/max_steps": "5401/6362"} +{"lm loss": 4.86364841, "grad_norm": 0.31325293, "learning_rate": 8.91e-06, "elapsed_time_per_iteration": 6.48492622, "memory(GiB)": 21.51, "elapsed_time": "9h 51m 37s", "remaining_time": "1h 45m 8s", "loss_scale": 1.0, "consumed_samples": 1382912, "global_step/max_steps": "5402/6362"} +{"lm loss": 4.88514519, "grad_norm": 0.29129234, "learning_rate": 8.9e-06, "elapsed_time_per_iteration": 6.60115767, "memory(GiB)": 21.51, "elapsed_time": "9h 51m 43s", "remaining_time": "1h 45m 1s", "loss_scale": 1.0, "consumed_samples": 1383168, "global_step/max_steps": "5403/6362"} +{"lm loss": 4.85521603, "grad_norm": 0.32407945, "learning_rate": 8.89e-06, "elapsed_time_per_iteration": 6.50459051, "memory(GiB)": 21.51, "elapsed_time": "9h 51m 50s", "remaining_time": "1h 44m 55s", "loss_scale": 1.0, "consumed_samples": 1383424, "global_step/max_steps": "5404/6362"} +{"lm loss": 4.8577528, "grad_norm": 0.30235586, "learning_rate": 8.88e-06, "elapsed_time_per_iteration": 6.66316271, "memory(GiB)": 21.51, "elapsed_time": "9h 51m 57s", "remaining_time": "1h 44m 48s", "loss_scale": 1.0, "consumed_samples": 1383680, "global_step/max_steps": "5405/6362"} +{"lm loss": 4.89444208, "grad_norm": 0.29714632, "learning_rate": 8.87e-06, "elapsed_time_per_iteration": 6.60780358, "memory(GiB)": 21.51, "elapsed_time": "9h 52m 3s", "remaining_time": "1h 44m 42s", "loss_scale": 1.0, "consumed_samples": 1383936, "global_step/max_steps": "5406/6362"} +{"lm loss": 4.86692953, "grad_norm": 0.29825321, "learning_rate": 8.85e-06, "elapsed_time_per_iteration": 6.5592618, "memory(GiB)": 21.51, "elapsed_time": "9h 52m 10s", "remaining_time": "1h 44m 35s", "loss_scale": 1.0, "consumed_samples": 1384192, "global_step/max_steps": "5407/6362"} +{"lm loss": 4.85732746, "grad_norm": 0.29622808, "learning_rate": 8.84e-06, "elapsed_time_per_iteration": 6.76279664, "memory(GiB)": 21.51, "elapsed_time": "9h 52m 17s", "remaining_time": "1h 44m 28s", "loss_scale": 1.0, "consumed_samples": 1384448, "global_step/max_steps": "5408/6362"} +{"lm loss": 4.85945511, "grad_norm": 0.30587348, "learning_rate": 8.83e-06, "elapsed_time_per_iteration": 6.60787129, "memory(GiB)": 21.51, "elapsed_time": "9h 52m 23s", "remaining_time": "1h 44m 22s", "loss_scale": 1.0, "consumed_samples": 1384704, "global_step/max_steps": "5409/6362"} +{"lm loss": 4.87986088, "grad_norm": 0.2942926, "learning_rate": 8.82e-06, "elapsed_time_per_iteration": 6.43504739, "memory(GiB)": 21.51, "elapsed_time": "9h 52m 30s", "remaining_time": "1h 44m 15s", "loss_scale": 1.0, "consumed_samples": 1384960, "global_step/max_steps": "5410/6362"} +{"lm loss": 4.88095903, "grad_norm": 0.30583829, "learning_rate": 8.81e-06, "elapsed_time_per_iteration": 6.51318502, "memory(GiB)": 21.51, "elapsed_time": "9h 52m 36s", "remaining_time": "1h 44m 9s", "loss_scale": 1.0, "consumed_samples": 1385216, "global_step/max_steps": "5411/6362"} +{"lm loss": 4.8569293, "grad_norm": 0.29187098, "learning_rate": 8.79e-06, "elapsed_time_per_iteration": 6.2041018, "memory(GiB)": 21.51, "elapsed_time": "9h 52m 42s", "remaining_time": "1h 44m 2s", "loss_scale": 1.0, "consumed_samples": 1385472, "global_step/max_steps": "5412/6362"} +{"lm loss": 4.86458158, "grad_norm": 0.30497929, "learning_rate": 8.78e-06, "elapsed_time_per_iteration": 6.53637934, "memory(GiB)": 21.51, "elapsed_time": "9h 52m 49s", "remaining_time": "1h 43m 55s", "loss_scale": 1.0, "consumed_samples": 1385728, "global_step/max_steps": "5413/6362"} +{"lm loss": 4.86338854, "grad_norm": 0.28643155, "learning_rate": 8.77e-06, "elapsed_time_per_iteration": 6.56030726, "memory(GiB)": 21.51, "elapsed_time": "9h 52m 55s", "remaining_time": "1h 43m 49s", "loss_scale": 1.0, "consumed_samples": 1385984, "global_step/max_steps": "5414/6362"} +{"lm loss": 4.86928463, "grad_norm": 0.28024837, "learning_rate": 8.76e-06, "elapsed_time_per_iteration": 6.45878315, "memory(GiB)": 21.51, "elapsed_time": "9h 53m 2s", "remaining_time": "1h 43m 42s", "loss_scale": 1.0, "consumed_samples": 1386240, "global_step/max_steps": "5415/6362"} +{"lm loss": 4.87898254, "grad_norm": 0.27977085, "learning_rate": 8.75e-06, "elapsed_time_per_iteration": 6.27282643, "memory(GiB)": 21.51, "elapsed_time": "9h 53m 8s", "remaining_time": "1h 43m 36s", "loss_scale": 1.0, "consumed_samples": 1386496, "global_step/max_steps": "5416/6362"} +{"lm loss": 4.89908361, "grad_norm": 0.30710036, "learning_rate": 8.73e-06, "elapsed_time_per_iteration": 6.39829016, "memory(GiB)": 21.51, "elapsed_time": "9h 53m 15s", "remaining_time": "1h 43m 29s", "loss_scale": 1.0, "consumed_samples": 1386752, "global_step/max_steps": "5417/6362"} +{"lm loss": 4.88021183, "grad_norm": 0.28825885, "learning_rate": 8.72e-06, "elapsed_time_per_iteration": 6.48104978, "memory(GiB)": 21.51, "elapsed_time": "9h 53m 21s", "remaining_time": "1h 43m 23s", "loss_scale": 1.0, "consumed_samples": 1387008, "global_step/max_steps": "5418/6362"} +{"lm loss": 4.86298132, "grad_norm": 0.28762943, "learning_rate": 8.71e-06, "elapsed_time_per_iteration": 6.59440923, "memory(GiB)": 21.51, "elapsed_time": "9h 53m 28s", "remaining_time": "1h 43m 16s", "loss_scale": 1.0, "consumed_samples": 1387264, "global_step/max_steps": "5419/6362"} +{"lm loss": 4.88055754, "grad_norm": 0.28983301, "learning_rate": 8.7e-06, "elapsed_time_per_iteration": 6.48399282, "memory(GiB)": 21.51, "elapsed_time": "9h 53m 34s", "remaining_time": "1h 43m 9s", "loss_scale": 1.0, "consumed_samples": 1387520, "global_step/max_steps": "5420/6362"} +{"lm loss": 4.8638792, "grad_norm": 0.28775495, "learning_rate": 8.69e-06, "elapsed_time_per_iteration": 6.46361446, "memory(GiB)": 21.51, "elapsed_time": "9h 53m 41s", "remaining_time": "1h 43m 3s", "loss_scale": 1.0, "consumed_samples": 1387776, "global_step/max_steps": "5421/6362"} +{"lm loss": 4.84877872, "grad_norm": 0.29464293, "learning_rate": 8.68e-06, "elapsed_time_per_iteration": 6.43467855, "memory(GiB)": 21.51, "elapsed_time": "9h 53m 47s", "remaining_time": "1h 42m 56s", "loss_scale": 1.0, "consumed_samples": 1388032, "global_step/max_steps": "5422/6362"} +{"lm loss": 4.87363577, "grad_norm": 0.28502429, "learning_rate": 8.66e-06, "elapsed_time_per_iteration": 6.33337784, "memory(GiB)": 21.51, "elapsed_time": "9h 53m 53s", "remaining_time": "1h 42m 50s", "loss_scale": 1.0, "consumed_samples": 1388288, "global_step/max_steps": "5423/6362"} +{"lm loss": 4.86217499, "grad_norm": 0.29963177, "learning_rate": 8.65e-06, "elapsed_time_per_iteration": 6.57870317, "memory(GiB)": 21.51, "elapsed_time": "9h 54m 0s", "remaining_time": "1h 42m 43s", "loss_scale": 1.0, "consumed_samples": 1388544, "global_step/max_steps": "5424/6362"} +{"lm loss": 4.85421133, "grad_norm": 0.28158262, "learning_rate": 8.64e-06, "elapsed_time_per_iteration": 6.99049425, "memory(GiB)": 21.51, "elapsed_time": "9h 54m 7s", "remaining_time": "1h 42m 36s", "loss_scale": 1.0, "consumed_samples": 1388800, "global_step/max_steps": "5425/6362"} +{"lm loss": 4.89183426, "grad_norm": 0.28276071, "learning_rate": 8.63e-06, "elapsed_time_per_iteration": 6.81155324, "memory(GiB)": 21.51, "elapsed_time": "9h 54m 14s", "remaining_time": "1h 42m 30s", "loss_scale": 1.0, "consumed_samples": 1389056, "global_step/max_steps": "5426/6362"} +{"lm loss": 4.8953476, "grad_norm": 0.28343281, "learning_rate": 8.62e-06, "elapsed_time_per_iteration": 6.5636301, "memory(GiB)": 21.51, "elapsed_time": "9h 54m 20s", "remaining_time": "1h 42m 23s", "loss_scale": 1.0, "consumed_samples": 1389312, "global_step/max_steps": "5427/6362"} +{"lm loss": 4.87991238, "grad_norm": 0.29180324, "learning_rate": 8.6e-06, "elapsed_time_per_iteration": 6.4902482, "memory(GiB)": 21.51, "elapsed_time": "9h 54m 27s", "remaining_time": "1h 42m 17s", "loss_scale": 1.0, "consumed_samples": 1389568, "global_step/max_steps": "5428/6362"} +{"lm loss": 4.8801837, "grad_norm": 0.27908173, "learning_rate": 8.59e-06, "elapsed_time_per_iteration": 6.56150794, "memory(GiB)": 21.51, "elapsed_time": "9h 54m 33s", "remaining_time": "1h 42m 10s", "loss_scale": 1.0, "consumed_samples": 1389824, "global_step/max_steps": "5429/6362"} +{"lm loss": 4.8668623, "grad_norm": 0.31332943, "learning_rate": 8.58e-06, "elapsed_time_per_iteration": 6.59622455, "memory(GiB)": 21.51, "elapsed_time": "9h 54m 40s", "remaining_time": "1h 42m 4s", "loss_scale": 1.0, "consumed_samples": 1390080, "global_step/max_steps": "5430/6362"} +{"lm loss": 4.88386202, "grad_norm": 0.28914464, "learning_rate": 8.57e-06, "elapsed_time_per_iteration": 6.47420073, "memory(GiB)": 21.51, "elapsed_time": "9h 54m 46s", "remaining_time": "1h 41m 57s", "loss_scale": 1.0, "consumed_samples": 1390336, "global_step/max_steps": "5431/6362"} +{"lm loss": 4.87492704, "grad_norm": 0.2894491, "learning_rate": 8.56e-06, "elapsed_time_per_iteration": 6.65413046, "memory(GiB)": 21.51, "elapsed_time": "9h 54m 53s", "remaining_time": "1h 41m 51s", "loss_scale": 1.0, "consumed_samples": 1390592, "global_step/max_steps": "5432/6362"} +{"lm loss": 4.8547368, "grad_norm": 0.30096939, "learning_rate": 8.55e-06, "elapsed_time_per_iteration": 6.53354287, "memory(GiB)": 21.51, "elapsed_time": "9h 55m 0s", "remaining_time": "1h 41m 44s", "loss_scale": 1.0, "consumed_samples": 1390848, "global_step/max_steps": "5433/6362"} +{"lm loss": 4.85592031, "grad_norm": 0.27987644, "learning_rate": 8.53e-06, "elapsed_time_per_iteration": 6.54812813, "memory(GiB)": 21.51, "elapsed_time": "9h 55m 6s", "remaining_time": "1h 41m 37s", "loss_scale": 1.0, "consumed_samples": 1391104, "global_step/max_steps": "5434/6362"} +{"lm loss": 4.87136984, "grad_norm": 0.29979321, "learning_rate": 8.52e-06, "elapsed_time_per_iteration": 6.65142941, "memory(GiB)": 21.51, "elapsed_time": "9h 55m 13s", "remaining_time": "1h 41m 31s", "loss_scale": 1.0, "consumed_samples": 1391360, "global_step/max_steps": "5435/6362"} +{"lm loss": 4.83946514, "grad_norm": 0.30144385, "learning_rate": 8.51e-06, "elapsed_time_per_iteration": 6.69402409, "memory(GiB)": 21.51, "elapsed_time": "9h 55m 20s", "remaining_time": "1h 41m 24s", "loss_scale": 1.0, "consumed_samples": 1391616, "global_step/max_steps": "5436/6362"} +{"lm loss": 4.87601709, "grad_norm": 0.29616949, "learning_rate": 8.5e-06, "elapsed_time_per_iteration": 6.46392298, "memory(GiB)": 21.51, "elapsed_time": "9h 55m 26s", "remaining_time": "1h 41m 18s", "loss_scale": 1.0, "consumed_samples": 1391872, "global_step/max_steps": "5437/6362"} +{"lm loss": 4.84518528, "grad_norm": 0.29586479, "learning_rate": 8.49e-06, "elapsed_time_per_iteration": 6.72377944, "memory(GiB)": 21.51, "elapsed_time": "9h 55m 33s", "remaining_time": "1h 41m 11s", "loss_scale": 1.0, "consumed_samples": 1392128, "global_step/max_steps": "5438/6362"} +{"lm loss": 4.8532114, "grad_norm": 0.29588857, "learning_rate": 8.48e-06, "elapsed_time_per_iteration": 6.46669865, "memory(GiB)": 21.51, "elapsed_time": "9h 55m 39s", "remaining_time": "1h 41m 5s", "loss_scale": 1.0, "consumed_samples": 1392384, "global_step/max_steps": "5439/6362"} +{"lm loss": 4.87617445, "grad_norm": 0.28761759, "learning_rate": 8.46e-06, "elapsed_time_per_iteration": 6.39074278, "memory(GiB)": 21.51, "elapsed_time": "9h 55m 46s", "remaining_time": "1h 40m 58s", "loss_scale": 1.0, "consumed_samples": 1392640, "global_step/max_steps": "5440/6362"} +{"lm loss": 4.88268137, "grad_norm": 0.28293261, "learning_rate": 8.45e-06, "elapsed_time_per_iteration": 6.60151815, "memory(GiB)": 21.51, "elapsed_time": "9h 55m 52s", "remaining_time": "1h 40m 51s", "loss_scale": 1.0, "consumed_samples": 1392896, "global_step/max_steps": "5441/6362"} +{"lm loss": 4.89111662, "grad_norm": 0.29999211, "learning_rate": 8.44e-06, "elapsed_time_per_iteration": 6.33703089, "memory(GiB)": 21.51, "elapsed_time": "9h 55m 58s", "remaining_time": "1h 40m 45s", "loss_scale": 1.0, "consumed_samples": 1393152, "global_step/max_steps": "5442/6362"} +{"lm loss": 4.87711191, "grad_norm": 0.30037126, "learning_rate": 8.43e-06, "elapsed_time_per_iteration": 6.62882876, "memory(GiB)": 21.51, "elapsed_time": "9h 56m 5s", "remaining_time": "1h 40m 38s", "loss_scale": 1.0, "consumed_samples": 1393408, "global_step/max_steps": "5443/6362"} +{"lm loss": 4.86733007, "grad_norm": 0.29373133, "learning_rate": 8.42e-06, "elapsed_time_per_iteration": 6.52488828, "memory(GiB)": 21.51, "elapsed_time": "9h 56m 12s", "remaining_time": "1h 40m 32s", "loss_scale": 1.0, "consumed_samples": 1393664, "global_step/max_steps": "5444/6362"} +{"lm loss": 4.86113262, "grad_norm": 0.29704082, "learning_rate": 8.41e-06, "elapsed_time_per_iteration": 6.48449254, "memory(GiB)": 21.51, "elapsed_time": "9h 56m 18s", "remaining_time": "1h 40m 25s", "loss_scale": 1.0, "consumed_samples": 1393920, "global_step/max_steps": "5445/6362"} +{"lm loss": 4.87086678, "grad_norm": 0.29738891, "learning_rate": 8.39e-06, "elapsed_time_per_iteration": 6.57879424, "memory(GiB)": 21.51, "elapsed_time": "9h 56m 25s", "remaining_time": "1h 40m 18s", "loss_scale": 1.0, "consumed_samples": 1394176, "global_step/max_steps": "5446/6362"} +{"lm loss": 4.87442589, "grad_norm": 0.27875465, "learning_rate": 8.38e-06, "elapsed_time_per_iteration": 6.70557642, "memory(GiB)": 21.51, "elapsed_time": "9h 56m 31s", "remaining_time": "1h 40m 12s", "loss_scale": 1.0, "consumed_samples": 1394432, "global_step/max_steps": "5447/6362"} +{"lm loss": 4.88695097, "grad_norm": 0.28874004, "learning_rate": 8.37e-06, "elapsed_time_per_iteration": 6.70616126, "memory(GiB)": 21.51, "elapsed_time": "9h 56m 38s", "remaining_time": "1h 40m 5s", "loss_scale": 1.0, "consumed_samples": 1394688, "global_step/max_steps": "5448/6362"} +{"lm loss": 4.88326931, "grad_norm": 0.29406556, "learning_rate": 8.36e-06, "elapsed_time_per_iteration": 6.73890924, "memory(GiB)": 21.51, "elapsed_time": "9h 56m 45s", "remaining_time": "1h 39m 59s", "loss_scale": 1.0, "consumed_samples": 1394944, "global_step/max_steps": "5449/6362"} +{"lm loss": 4.86960745, "grad_norm": 0.29644251, "learning_rate": 8.35e-06, "elapsed_time_per_iteration": 6.7597158, "memory(GiB)": 21.51, "elapsed_time": "9h 56m 52s", "remaining_time": "1h 39m 52s", "loss_scale": 1.0, "consumed_samples": 1395200, "global_step/max_steps": "5450/6362"} +{"lm loss": 4.85411978, "grad_norm": 0.27194673, "learning_rate": 8.34e-06, "elapsed_time_per_iteration": 6.69105196, "memory(GiB)": 21.51, "elapsed_time": "9h 56m 58s", "remaining_time": "1h 39m 46s", "loss_scale": 1.0, "consumed_samples": 1395456, "global_step/max_steps": "5451/6362"} +{"lm loss": 4.87257719, "grad_norm": 0.28213418, "learning_rate": 8.33e-06, "elapsed_time_per_iteration": 6.4190836, "memory(GiB)": 21.51, "elapsed_time": "9h 57m 5s", "remaining_time": "1h 39m 39s", "loss_scale": 1.0, "consumed_samples": 1395712, "global_step/max_steps": "5452/6362"} +{"lm loss": 4.88685274, "grad_norm": 0.32572201, "learning_rate": 8.31e-06, "elapsed_time_per_iteration": 6.65817571, "memory(GiB)": 21.51, "elapsed_time": "9h 57m 11s", "remaining_time": "1h 39m 33s", "loss_scale": 1.0, "consumed_samples": 1395968, "global_step/max_steps": "5453/6362"} +{"lm loss": 4.84797812, "grad_norm": 0.29895166, "learning_rate": 8.3e-06, "elapsed_time_per_iteration": 6.63661194, "memory(GiB)": 21.51, "elapsed_time": "9h 57m 18s", "remaining_time": "1h 39m 26s", "loss_scale": 1.0, "consumed_samples": 1396224, "global_step/max_steps": "5454/6362"} +{"lm loss": 4.86999655, "grad_norm": 0.29327625, "learning_rate": 8.29e-06, "elapsed_time_per_iteration": 6.66500878, "memory(GiB)": 21.51, "elapsed_time": "9h 57m 25s", "remaining_time": "1h 39m 19s", "loss_scale": 1.0, "consumed_samples": 1396480, "global_step/max_steps": "5455/6362"} +{"lm loss": 4.87863636, "grad_norm": 0.29897556, "learning_rate": 8.28e-06, "elapsed_time_per_iteration": 6.56708074, "memory(GiB)": 21.51, "elapsed_time": "9h 57m 31s", "remaining_time": "1h 39m 13s", "loss_scale": 1.0, "consumed_samples": 1396736, "global_step/max_steps": "5456/6362"} +{"lm loss": 4.86232853, "grad_norm": 0.29789126, "learning_rate": 8.27e-06, "elapsed_time_per_iteration": 6.40640426, "memory(GiB)": 21.51, "elapsed_time": "9h 57m 38s", "remaining_time": "1h 39m 6s", "loss_scale": 1.0, "consumed_samples": 1396992, "global_step/max_steps": "5457/6362"} +{"lm loss": 4.86131144, "grad_norm": 0.31181124, "learning_rate": 8.26e-06, "elapsed_time_per_iteration": 6.68442321, "memory(GiB)": 21.51, "elapsed_time": "9h 57m 44s", "remaining_time": "1h 39m 0s", "loss_scale": 1.0, "consumed_samples": 1397248, "global_step/max_steps": "5458/6362"} +{"lm loss": 4.84412384, "grad_norm": 0.30193362, "learning_rate": 8.25e-06, "elapsed_time_per_iteration": 6.45825267, "memory(GiB)": 21.51, "elapsed_time": "9h 57m 51s", "remaining_time": "1h 38m 53s", "loss_scale": 1.0, "consumed_samples": 1397504, "global_step/max_steps": "5459/6362"} +{"lm loss": 4.87509584, "grad_norm": 0.30456966, "learning_rate": 8.23e-06, "elapsed_time_per_iteration": 6.47280073, "memory(GiB)": 21.51, "elapsed_time": "9h 57m 57s", "remaining_time": "1h 38m 47s", "loss_scale": 1.0, "consumed_samples": 1397760, "global_step/max_steps": "5460/6362"} +{"lm loss": 4.87317371, "grad_norm": 0.30327863, "learning_rate": 8.22e-06, "elapsed_time_per_iteration": 6.60274959, "memory(GiB)": 21.51, "elapsed_time": "9h 58m 4s", "remaining_time": "1h 38m 40s", "loss_scale": 1.0, "consumed_samples": 1398016, "global_step/max_steps": "5461/6362"} +{"lm loss": 4.87928009, "grad_norm": 0.30278185, "learning_rate": 8.21e-06, "elapsed_time_per_iteration": 6.49155688, "memory(GiB)": 21.51, "elapsed_time": "9h 58m 10s", "remaining_time": "1h 38m 33s", "loss_scale": 1.0, "consumed_samples": 1398272, "global_step/max_steps": "5462/6362"} +{"lm loss": 4.87069798, "grad_norm": 0.30796477, "learning_rate": 8.2e-06, "elapsed_time_per_iteration": 6.54592133, "memory(GiB)": 21.51, "elapsed_time": "9h 58m 17s", "remaining_time": "1h 38m 27s", "loss_scale": 1.0, "consumed_samples": 1398528, "global_step/max_steps": "5463/6362"} +{"lm loss": 4.86573076, "grad_norm": 0.3177374, "learning_rate": 8.19e-06, "elapsed_time_per_iteration": 6.46002746, "memory(GiB)": 21.51, "elapsed_time": "9h 58m 23s", "remaining_time": "1h 38m 20s", "loss_scale": 1.0, "consumed_samples": 1398784, "global_step/max_steps": "5464/6362"} +{"lm loss": 4.8742857, "grad_norm": 0.28887314, "learning_rate": 8.18e-06, "elapsed_time_per_iteration": 6.61032319, "memory(GiB)": 21.51, "elapsed_time": "9h 58m 30s", "remaining_time": "1h 38m 14s", "loss_scale": 1.0, "consumed_samples": 1399040, "global_step/max_steps": "5465/6362"} +{"lm loss": 4.88221121, "grad_norm": 0.2882753, "learning_rate": 8.17e-06, "elapsed_time_per_iteration": 6.80117464, "memory(GiB)": 21.51, "elapsed_time": "9h 58m 37s", "remaining_time": "1h 38m 7s", "loss_scale": 1.0, "consumed_samples": 1399296, "global_step/max_steps": "5466/6362"} +{"lm loss": 4.89174557, "grad_norm": 0.28842214, "learning_rate": 8.15e-06, "elapsed_time_per_iteration": 6.46067071, "memory(GiB)": 21.51, "elapsed_time": "9h 58m 43s", "remaining_time": "1h 38m 1s", "loss_scale": 1.0, "consumed_samples": 1399552, "global_step/max_steps": "5467/6362"} +{"lm loss": 4.85931969, "grad_norm": 0.28934234, "learning_rate": 8.14e-06, "elapsed_time_per_iteration": 6.46371007, "memory(GiB)": 21.51, "elapsed_time": "9h 58m 50s", "remaining_time": "1h 37m 54s", "loss_scale": 1.0, "consumed_samples": 1399808, "global_step/max_steps": "5468/6362"} +{"lm loss": 4.86854506, "grad_norm": 0.29418385, "learning_rate": 8.13e-06, "elapsed_time_per_iteration": 6.39340973, "memory(GiB)": 21.51, "elapsed_time": "9h 58m 56s", "remaining_time": "1h 37m 47s", "loss_scale": 1.0, "consumed_samples": 1400064, "global_step/max_steps": "5469/6362"} +{"lm loss": 4.87946367, "grad_norm": 0.2886253, "learning_rate": 8.12e-06, "elapsed_time_per_iteration": 6.79472399, "memory(GiB)": 21.51, "elapsed_time": "9h 59m 3s", "remaining_time": "1h 37m 41s", "loss_scale": 1.0, "consumed_samples": 1400320, "global_step/max_steps": "5470/6362"} +{"lm loss": 4.89147091, "grad_norm": 0.27723226, "learning_rate": 8.11e-06, "elapsed_time_per_iteration": 6.82109904, "memory(GiB)": 21.51, "elapsed_time": "9h 59m 10s", "remaining_time": "1h 37m 34s", "loss_scale": 1.0, "consumed_samples": 1400576, "global_step/max_steps": "5471/6362"} +{"lm loss": 4.85550213, "grad_norm": 0.28057942, "learning_rate": 8.1e-06, "elapsed_time_per_iteration": 6.54069352, "memory(GiB)": 21.51, "elapsed_time": "9h 59m 16s", "remaining_time": "1h 37m 28s", "loss_scale": 1.0, "consumed_samples": 1400832, "global_step/max_steps": "5472/6362"} +{"lm loss": 4.86535788, "grad_norm": 0.29176688, "learning_rate": 8.09e-06, "elapsed_time_per_iteration": 6.46721029, "memory(GiB)": 21.51, "elapsed_time": "9h 59m 23s", "remaining_time": "1h 37m 21s", "loss_scale": 1.0, "consumed_samples": 1401088, "global_step/max_steps": "5473/6362"} +{"lm loss": 4.85976887, "grad_norm": 0.31501064, "learning_rate": 8.08e-06, "elapsed_time_per_iteration": 6.59235191, "memory(GiB)": 21.51, "elapsed_time": "9h 59m 29s", "remaining_time": "1h 37m 15s", "loss_scale": 1.0, "consumed_samples": 1401344, "global_step/max_steps": "5474/6362"} +{"lm loss": 4.88018417, "grad_norm": 0.27681994, "learning_rate": 8.06e-06, "elapsed_time_per_iteration": 6.49840045, "memory(GiB)": 21.51, "elapsed_time": "9h 59m 36s", "remaining_time": "1h 37m 8s", "loss_scale": 1.0, "consumed_samples": 1401600, "global_step/max_steps": "5475/6362"} +{"lm loss": 4.87022066, "grad_norm": 0.31095791, "learning_rate": 8.05e-06, "elapsed_time_per_iteration": 6.59015799, "memory(GiB)": 21.51, "elapsed_time": "9h 59m 42s", "remaining_time": "1h 37m 1s", "loss_scale": 1.0, "consumed_samples": 1401856, "global_step/max_steps": "5476/6362"} +{"lm loss": 4.89973831, "grad_norm": 0.29489064, "learning_rate": 8.04e-06, "elapsed_time_per_iteration": 6.61459327, "memory(GiB)": 21.51, "elapsed_time": "9h 59m 49s", "remaining_time": "1h 36m 55s", "loss_scale": 1.0, "consumed_samples": 1402112, "global_step/max_steps": "5477/6362"} +{"lm loss": 4.86074591, "grad_norm": 0.30183285, "learning_rate": 8.03e-06, "elapsed_time_per_iteration": 6.54816914, "memory(GiB)": 21.51, "elapsed_time": "9h 59m 56s", "remaining_time": "1h 36m 48s", "loss_scale": 1.0, "consumed_samples": 1402368, "global_step/max_steps": "5478/6362"} +{"lm loss": 4.8563385, "grad_norm": 0.28448126, "learning_rate": 8.02e-06, "elapsed_time_per_iteration": 6.58545828, "memory(GiB)": 21.51, "elapsed_time": "10h 0m 2s", "remaining_time": "1h 36m 42s", "loss_scale": 1.0, "consumed_samples": 1402624, "global_step/max_steps": "5479/6362"} +{"lm loss": 4.88121748, "grad_norm": 0.29463476, "learning_rate": 8.01e-06, "elapsed_time_per_iteration": 6.47206426, "memory(GiB)": 21.51, "elapsed_time": "10h 0m 9s", "remaining_time": "1h 36m 35s", "loss_scale": 1.0, "consumed_samples": 1402880, "global_step/max_steps": "5480/6362"} +{"lm loss": 4.88150597, "grad_norm": 0.2921336, "learning_rate": 8e-06, "elapsed_time_per_iteration": 6.63035154, "memory(GiB)": 21.51, "elapsed_time": "10h 0m 15s", "remaining_time": "1h 36m 29s", "loss_scale": 1.0, "consumed_samples": 1403136, "global_step/max_steps": "5481/6362"} +{"lm loss": 4.8686862, "grad_norm": 0.28281844, "learning_rate": 7.99e-06, "elapsed_time_per_iteration": 6.28997493, "memory(GiB)": 21.51, "elapsed_time": "10h 0m 22s", "remaining_time": "1h 36m 22s", "loss_scale": 1.0, "consumed_samples": 1403392, "global_step/max_steps": "5482/6362"} +{"lm loss": 4.87379885, "grad_norm": 0.27907851, "learning_rate": 7.97e-06, "elapsed_time_per_iteration": 6.42451453, "memory(GiB)": 21.51, "elapsed_time": "10h 0m 28s", "remaining_time": "1h 36m 15s", "loss_scale": 1.0, "consumed_samples": 1403648, "global_step/max_steps": "5483/6362"} +{"lm loss": 4.85364962, "grad_norm": 0.28342575, "learning_rate": 7.96e-06, "elapsed_time_per_iteration": 6.43699026, "memory(GiB)": 21.51, "elapsed_time": "10h 0m 34s", "remaining_time": "1h 36m 9s", "loss_scale": 1.0, "consumed_samples": 1403904, "global_step/max_steps": "5484/6362"} +{"lm loss": 4.88168383, "grad_norm": 0.27950829, "learning_rate": 7.95e-06, "elapsed_time_per_iteration": 6.33611846, "memory(GiB)": 21.51, "elapsed_time": "10h 0m 41s", "remaining_time": "1h 36m 2s", "loss_scale": 1.0, "consumed_samples": 1404160, "global_step/max_steps": "5485/6362"} +{"lm loss": 4.87899446, "grad_norm": 0.28663927, "learning_rate": 7.94e-06, "elapsed_time_per_iteration": 6.5872376, "memory(GiB)": 21.51, "elapsed_time": "10h 0m 47s", "remaining_time": "1h 35m 56s", "loss_scale": 1.0, "consumed_samples": 1404416, "global_step/max_steps": "5486/6362"} +{"lm loss": 4.84408283, "grad_norm": 0.2963464, "learning_rate": 7.93e-06, "elapsed_time_per_iteration": 6.33766651, "memory(GiB)": 21.51, "elapsed_time": "10h 0m 54s", "remaining_time": "1h 35m 49s", "loss_scale": 1.0, "consumed_samples": 1404672, "global_step/max_steps": "5487/6362"} +{"lm loss": 4.88799763, "grad_norm": 0.29544443, "learning_rate": 7.92e-06, "elapsed_time_per_iteration": 6.13812137, "memory(GiB)": 21.51, "elapsed_time": "10h 1m 0s", "remaining_time": "1h 35m 42s", "loss_scale": 1.0, "consumed_samples": 1404928, "global_step/max_steps": "5488/6362"} +{"lm loss": 4.86693192, "grad_norm": 0.27291864, "learning_rate": 7.91e-06, "elapsed_time_per_iteration": 6.50929284, "memory(GiB)": 21.51, "elapsed_time": "10h 1m 6s", "remaining_time": "1h 35m 36s", "loss_scale": 1.0, "consumed_samples": 1405184, "global_step/max_steps": "5489/6362"} +{"lm loss": 4.87928581, "grad_norm": 0.30168226, "learning_rate": 7.9e-06, "elapsed_time_per_iteration": 6.51212335, "memory(GiB)": 21.51, "elapsed_time": "10h 1m 13s", "remaining_time": "1h 35m 29s", "loss_scale": 1.0, "consumed_samples": 1405440, "global_step/max_steps": "5490/6362"} +{"lm loss": 4.87742043, "grad_norm": 0.28327629, "learning_rate": 7.89e-06, "elapsed_time_per_iteration": 6.59418941, "memory(GiB)": 21.51, "elapsed_time": "10h 1m 19s", "remaining_time": "1h 35m 23s", "loss_scale": 1.0, "consumed_samples": 1405696, "global_step/max_steps": "5491/6362"} +{"lm loss": 4.88163805, "grad_norm": 0.28121978, "learning_rate": 7.88e-06, "elapsed_time_per_iteration": 6.52816606, "memory(GiB)": 21.51, "elapsed_time": "10h 1m 26s", "remaining_time": "1h 35m 16s", "loss_scale": 1.0, "consumed_samples": 1405952, "global_step/max_steps": "5492/6362"} +{"lm loss": 4.88616419, "grad_norm": 0.29105145, "learning_rate": 7.86e-06, "elapsed_time_per_iteration": 6.51576662, "memory(GiB)": 21.51, "elapsed_time": "10h 1m 32s", "remaining_time": "1h 35m 9s", "loss_scale": 1.0, "consumed_samples": 1406208, "global_step/max_steps": "5493/6362"} +{"lm loss": 4.85271311, "grad_norm": 0.29778114, "learning_rate": 7.85e-06, "elapsed_time_per_iteration": 6.66891527, "memory(GiB)": 21.51, "elapsed_time": "10h 1m 39s", "remaining_time": "1h 35m 3s", "loss_scale": 1.0, "consumed_samples": 1406464, "global_step/max_steps": "5494/6362"} +{"lm loss": 4.87704182, "grad_norm": 0.28146237, "learning_rate": 7.84e-06, "elapsed_time_per_iteration": 7.00749993, "memory(GiB)": 21.51, "elapsed_time": "10h 1m 46s", "remaining_time": "1h 34m 56s", "loss_scale": 1.0, "consumed_samples": 1406720, "global_step/max_steps": "5495/6362"} +{"lm loss": 4.87690067, "grad_norm": 0.31079033, "learning_rate": 7.83e-06, "elapsed_time_per_iteration": 6.53670478, "memory(GiB)": 21.51, "elapsed_time": "10h 1m 53s", "remaining_time": "1h 34m 50s", "loss_scale": 1.0, "consumed_samples": 1406976, "global_step/max_steps": "5496/6362"} +{"lm loss": 4.86994791, "grad_norm": 0.28842992, "learning_rate": 7.82e-06, "elapsed_time_per_iteration": 6.61234236, "memory(GiB)": 21.51, "elapsed_time": "10h 1m 59s", "remaining_time": "1h 34m 43s", "loss_scale": 1.0, "consumed_samples": 1407232, "global_step/max_steps": "5497/6362"} +{"lm loss": 4.88321257, "grad_norm": 0.32710299, "learning_rate": 7.81e-06, "elapsed_time_per_iteration": 6.77301884, "memory(GiB)": 21.51, "elapsed_time": "10h 2m 6s", "remaining_time": "1h 34m 37s", "loss_scale": 1.0, "consumed_samples": 1407488, "global_step/max_steps": "5498/6362"} +{"lm loss": 4.87624836, "grad_norm": 0.2924667, "learning_rate": 7.8e-06, "elapsed_time_per_iteration": 6.33840561, "memory(GiB)": 21.51, "elapsed_time": "10h 2m 12s", "remaining_time": "1h 34m 30s", "loss_scale": 1.0, "consumed_samples": 1407744, "global_step/max_steps": "5499/6362"} +{"lm loss": 4.85482407, "grad_norm": 0.284455, "learning_rate": 7.79e-06, "elapsed_time_per_iteration": 6.56230831, "memory(GiB)": 21.51, "elapsed_time": "10h 2m 19s", "remaining_time": "1h 34m 24s", "loss_scale": 1.0, "consumed_samples": 1408000, "global_step/max_steps": "5500/6362"} +{"lm loss": 4.88122225, "grad_norm": 0.29565483, "learning_rate": 7.78e-06, "elapsed_time_per_iteration": 6.23633838, "memory(GiB)": 21.51, "elapsed_time": "10h 2m 25s", "remaining_time": "1h 34m 17s", "loss_scale": 1.0, "consumed_samples": 1408256, "global_step/max_steps": "5501/6362"} +{"lm loss": 4.86741447, "grad_norm": 0.30310643, "learning_rate": 7.77e-06, "elapsed_time_per_iteration": 6.21880341, "memory(GiB)": 21.51, "elapsed_time": "10h 2m 31s", "remaining_time": "1h 34m 10s", "loss_scale": 1.0, "consumed_samples": 1408512, "global_step/max_steps": "5502/6362"} +{"lm loss": 4.85043812, "grad_norm": 0.28846404, "learning_rate": 7.75e-06, "elapsed_time_per_iteration": 6.22725129, "memory(GiB)": 21.51, "elapsed_time": "10h 2m 38s", "remaining_time": "1h 34m 4s", "loss_scale": 1.0, "consumed_samples": 1408768, "global_step/max_steps": "5503/6362"} +{"lm loss": 4.85388279, "grad_norm": 0.30476052, "learning_rate": 7.74e-06, "elapsed_time_per_iteration": 6.4350853, "memory(GiB)": 21.51, "elapsed_time": "10h 2m 44s", "remaining_time": "1h 33m 57s", "loss_scale": 1.0, "consumed_samples": 1409024, "global_step/max_steps": "5504/6362"} +{"lm loss": 4.8668499, "grad_norm": 0.29329187, "learning_rate": 7.73e-06, "elapsed_time_per_iteration": 6.48120427, "memory(GiB)": 21.51, "elapsed_time": "10h 2m 51s", "remaining_time": "1h 33m 50s", "loss_scale": 1.0, "consumed_samples": 1409280, "global_step/max_steps": "5505/6362"} +{"lm loss": 4.87506962, "grad_norm": 0.29942894, "learning_rate": 7.72e-06, "elapsed_time_per_iteration": 6.53854728, "memory(GiB)": 21.51, "elapsed_time": "10h 2m 57s", "remaining_time": "1h 33m 44s", "loss_scale": 1.0, "consumed_samples": 1409536, "global_step/max_steps": "5506/6362"} +{"lm loss": 4.84805202, "grad_norm": 0.30693921, "learning_rate": 7.71e-06, "elapsed_time_per_iteration": 6.66132927, "memory(GiB)": 21.51, "elapsed_time": "10h 3m 4s", "remaining_time": "1h 33m 37s", "loss_scale": 1.0, "consumed_samples": 1409792, "global_step/max_steps": "5507/6362"} +{"lm loss": 4.86899471, "grad_norm": 0.28495038, "learning_rate": 7.7e-06, "elapsed_time_per_iteration": 6.46144247, "memory(GiB)": 21.51, "elapsed_time": "10h 3m 10s", "remaining_time": "1h 33m 31s", "loss_scale": 1.0, "consumed_samples": 1410048, "global_step/max_steps": "5508/6362"} +{"lm loss": 4.8759613, "grad_norm": 0.2868742, "learning_rate": 7.69e-06, "elapsed_time_per_iteration": 6.32837296, "memory(GiB)": 21.51, "elapsed_time": "10h 3m 17s", "remaining_time": "1h 33m 24s", "loss_scale": 1.0, "consumed_samples": 1410304, "global_step/max_steps": "5509/6362"} +{"lm loss": 4.85985899, "grad_norm": 0.29701051, "learning_rate": 7.68e-06, "elapsed_time_per_iteration": 6.54724932, "memory(GiB)": 21.51, "elapsed_time": "10h 3m 23s", "remaining_time": "1h 33m 18s", "loss_scale": 1.0, "consumed_samples": 1410560, "global_step/max_steps": "5510/6362"} +{"lm loss": 4.86657619, "grad_norm": 0.28764677, "learning_rate": 7.67e-06, "elapsed_time_per_iteration": 6.4474988, "memory(GiB)": 21.51, "elapsed_time": "10h 3m 30s", "remaining_time": "1h 33m 11s", "loss_scale": 1.0, "consumed_samples": 1410816, "global_step/max_steps": "5511/6362"} +{"lm loss": 4.86592913, "grad_norm": 0.29163438, "learning_rate": 7.66e-06, "elapsed_time_per_iteration": 6.50218534, "memory(GiB)": 21.51, "elapsed_time": "10h 3m 36s", "remaining_time": "1h 33m 4s", "loss_scale": 1.0, "consumed_samples": 1411072, "global_step/max_steps": "5512/6362"} +{"lm loss": 4.8557353, "grad_norm": 0.27887678, "learning_rate": 7.65e-06, "elapsed_time_per_iteration": 6.57665634, "memory(GiB)": 21.51, "elapsed_time": "10h 3m 43s", "remaining_time": "1h 32m 58s", "loss_scale": 1.0, "consumed_samples": 1411328, "global_step/max_steps": "5513/6362"} +{"lm loss": 4.86885786, "grad_norm": 0.29102361, "learning_rate": 7.64e-06, "elapsed_time_per_iteration": 7.28657722, "memory(GiB)": 21.51, "elapsed_time": "10h 3m 50s", "remaining_time": "1h 32m 51s", "loss_scale": 1.0, "consumed_samples": 1411584, "global_step/max_steps": "5514/6362"} +{"lm loss": 4.88366699, "grad_norm": 0.29022467, "learning_rate": 7.63e-06, "elapsed_time_per_iteration": 6.46171737, "memory(GiB)": 21.51, "elapsed_time": "10h 3m 56s", "remaining_time": "1h 32m 45s", "loss_scale": 1.0, "consumed_samples": 1411840, "global_step/max_steps": "5515/6362"} +{"lm loss": 4.85677719, "grad_norm": 0.29004431, "learning_rate": 7.61e-06, "elapsed_time_per_iteration": 6.78346419, "memory(GiB)": 21.51, "elapsed_time": "10h 4m 3s", "remaining_time": "1h 32m 38s", "loss_scale": 1.0, "consumed_samples": 1412096, "global_step/max_steps": "5516/6362"} +{"lm loss": 4.88665867, "grad_norm": 0.28078029, "learning_rate": 7.6e-06, "elapsed_time_per_iteration": 6.57991624, "memory(GiB)": 21.51, "elapsed_time": "10h 4m 10s", "remaining_time": "1h 32m 32s", "loss_scale": 1.0, "consumed_samples": 1412352, "global_step/max_steps": "5517/6362"} +{"lm loss": 4.86312056, "grad_norm": 0.27730516, "learning_rate": 7.59e-06, "elapsed_time_per_iteration": 6.54728913, "memory(GiB)": 21.51, "elapsed_time": "10h 4m 16s", "remaining_time": "1h 32m 25s", "loss_scale": 1.0, "consumed_samples": 1412608, "global_step/max_steps": "5518/6362"} +{"lm loss": 4.84801245, "grad_norm": 0.28684732, "learning_rate": 7.58e-06, "elapsed_time_per_iteration": 6.65817904, "memory(GiB)": 21.51, "elapsed_time": "10h 4m 23s", "remaining_time": "1h 32m 19s", "loss_scale": 1.0, "consumed_samples": 1412864, "global_step/max_steps": "5519/6362"} +{"lm loss": 4.88141632, "grad_norm": 0.28452736, "learning_rate": 7.57e-06, "elapsed_time_per_iteration": 6.51314235, "memory(GiB)": 21.51, "elapsed_time": "10h 4m 29s", "remaining_time": "1h 32m 12s", "loss_scale": 1.0, "consumed_samples": 1413120, "global_step/max_steps": "5520/6362"} +{"lm loss": 4.88018084, "grad_norm": 0.29595616, "learning_rate": 7.56e-06, "elapsed_time_per_iteration": 6.78280544, "memory(GiB)": 21.51, "elapsed_time": "10h 4m 36s", "remaining_time": "1h 32m 5s", "loss_scale": 1.0, "consumed_samples": 1413376, "global_step/max_steps": "5521/6362"} +{"lm loss": 4.88211298, "grad_norm": 0.2771287, "learning_rate": 7.55e-06, "elapsed_time_per_iteration": 6.41599774, "memory(GiB)": 21.51, "elapsed_time": "10h 4m 43s", "remaining_time": "1h 31m 59s", "loss_scale": 1.0, "consumed_samples": 1413632, "global_step/max_steps": "5522/6362"} +{"lm loss": 4.87779427, "grad_norm": 0.29048061, "learning_rate": 7.54e-06, "elapsed_time_per_iteration": 6.37221551, "memory(GiB)": 21.51, "elapsed_time": "10h 4m 49s", "remaining_time": "1h 31m 52s", "loss_scale": 1.0, "consumed_samples": 1413888, "global_step/max_steps": "5523/6362"} +{"lm loss": 4.85172272, "grad_norm": 0.27917171, "learning_rate": 7.53e-06, "elapsed_time_per_iteration": 6.46798134, "memory(GiB)": 21.51, "elapsed_time": "10h 4m 55s", "remaining_time": "1h 31m 46s", "loss_scale": 1.0, "consumed_samples": 1414144, "global_step/max_steps": "5524/6362"} +{"lm loss": 4.89245892, "grad_norm": 0.28063586, "learning_rate": 7.52e-06, "elapsed_time_per_iteration": 6.52681398, "memory(GiB)": 21.51, "elapsed_time": "10h 5m 2s", "remaining_time": "1h 31m 39s", "loss_scale": 1.0, "consumed_samples": 1414400, "global_step/max_steps": "5525/6362"} +{"lm loss": 4.8677597, "grad_norm": 0.29510906, "learning_rate": 7.51e-06, "elapsed_time_per_iteration": 6.28830767, "memory(GiB)": 21.51, "elapsed_time": "10h 5m 8s", "remaining_time": "1h 31m 32s", "loss_scale": 1.0, "consumed_samples": 1414656, "global_step/max_steps": "5526/6362"} +{"lm loss": 4.85828304, "grad_norm": 0.26730022, "learning_rate": 7.5e-06, "elapsed_time_per_iteration": 6.49415493, "memory(GiB)": 21.51, "elapsed_time": "10h 5m 15s", "remaining_time": "1h 31m 26s", "loss_scale": 1.0, "consumed_samples": 1414912, "global_step/max_steps": "5527/6362"} +{"lm loss": 4.88127565, "grad_norm": 0.28809583, "learning_rate": 7.49e-06, "elapsed_time_per_iteration": 6.4737606, "memory(GiB)": 21.51, "elapsed_time": "10h 5m 21s", "remaining_time": "1h 31m 19s", "loss_scale": 1.0, "consumed_samples": 1415168, "global_step/max_steps": "5528/6362"} +{"lm loss": 4.8710804, "grad_norm": 0.29367763, "learning_rate": 7.48e-06, "elapsed_time_per_iteration": 6.51616335, "memory(GiB)": 21.51, "elapsed_time": "10h 5m 28s", "remaining_time": "1h 31m 13s", "loss_scale": 1.0, "consumed_samples": 1415424, "global_step/max_steps": "5529/6362"} +{"lm loss": 4.87397051, "grad_norm": 0.28502572, "learning_rate": 7.47e-06, "elapsed_time_per_iteration": 6.72812057, "memory(GiB)": 21.51, "elapsed_time": "10h 5m 35s", "remaining_time": "1h 31m 6s", "loss_scale": 1.0, "consumed_samples": 1415680, "global_step/max_steps": "5530/6362"} +{"lm loss": 4.87418509, "grad_norm": 0.28470567, "learning_rate": 7.45e-06, "elapsed_time_per_iteration": 6.67621326, "memory(GiB)": 21.51, "elapsed_time": "10h 5m 41s", "remaining_time": "1h 31m 0s", "loss_scale": 1.0, "consumed_samples": 1415936, "global_step/max_steps": "5531/6362"} +{"lm loss": 4.89365149, "grad_norm": 0.28188163, "learning_rate": 7.44e-06, "elapsed_time_per_iteration": 6.61546397, "memory(GiB)": 21.51, "elapsed_time": "10h 5m 48s", "remaining_time": "1h 30m 53s", "loss_scale": 1.0, "consumed_samples": 1416192, "global_step/max_steps": "5532/6362"} +{"lm loss": 4.87056065, "grad_norm": 0.28639716, "learning_rate": 7.43e-06, "elapsed_time_per_iteration": 6.53861547, "memory(GiB)": 21.51, "elapsed_time": "10h 5m 54s", "remaining_time": "1h 30m 46s", "loss_scale": 1.0, "consumed_samples": 1416448, "global_step/max_steps": "5533/6362"} +{"lm loss": 4.86572409, "grad_norm": 0.29294917, "learning_rate": 7.42e-06, "elapsed_time_per_iteration": 6.56244874, "memory(GiB)": 21.51, "elapsed_time": "10h 6m 1s", "remaining_time": "1h 30m 40s", "loss_scale": 1.0, "consumed_samples": 1416704, "global_step/max_steps": "5534/6362"} +{"lm loss": 4.86715126, "grad_norm": 0.29224226, "learning_rate": 7.41e-06, "elapsed_time_per_iteration": 6.52792192, "memory(GiB)": 21.51, "elapsed_time": "10h 6m 7s", "remaining_time": "1h 30m 33s", "loss_scale": 1.0, "consumed_samples": 1416960, "global_step/max_steps": "5535/6362"} +{"lm loss": 4.84766626, "grad_norm": 0.29007843, "learning_rate": 7.4e-06, "elapsed_time_per_iteration": 6.53621817, "memory(GiB)": 21.51, "elapsed_time": "10h 6m 14s", "remaining_time": "1h 30m 27s", "loss_scale": 1.0, "consumed_samples": 1417216, "global_step/max_steps": "5536/6362"} +{"lm loss": 4.85234213, "grad_norm": 0.28430477, "learning_rate": 7.39e-06, "elapsed_time_per_iteration": 6.49904513, "memory(GiB)": 21.51, "elapsed_time": "10h 6m 20s", "remaining_time": "1h 30m 20s", "loss_scale": 1.0, "consumed_samples": 1417472, "global_step/max_steps": "5537/6362"} +{"lm loss": 4.88313484, "grad_norm": 0.27932259, "learning_rate": 7.38e-06, "elapsed_time_per_iteration": 6.67558408, "memory(GiB)": 21.51, "elapsed_time": "10h 6m 27s", "remaining_time": "1h 30m 14s", "loss_scale": 1.0, "consumed_samples": 1417728, "global_step/max_steps": "5538/6362"} +{"lm loss": 4.87879753, "grad_norm": 0.27290627, "learning_rate": 7.37e-06, "elapsed_time_per_iteration": 6.6117599, "memory(GiB)": 21.51, "elapsed_time": "10h 6m 34s", "remaining_time": "1h 30m 7s", "loss_scale": 1.0, "consumed_samples": 1417984, "global_step/max_steps": "5539/6362"} +{"lm loss": 4.85475731, "grad_norm": 0.27720639, "learning_rate": 7.36e-06, "elapsed_time_per_iteration": 6.54558802, "memory(GiB)": 21.51, "elapsed_time": "10h 6m 40s", "remaining_time": "1h 30m 0s", "loss_scale": 1.0, "consumed_samples": 1418240, "global_step/max_steps": "5540/6362"} +{"lm loss": 4.88264799, "grad_norm": 0.27908984, "learning_rate": 7.35e-06, "elapsed_time_per_iteration": 6.64406157, "memory(GiB)": 21.51, "elapsed_time": "10h 6m 47s", "remaining_time": "1h 29m 54s", "loss_scale": 1.0, "consumed_samples": 1418496, "global_step/max_steps": "5541/6362"} +{"lm loss": 4.87806892, "grad_norm": 0.28176239, "learning_rate": 7.34e-06, "elapsed_time_per_iteration": 6.57741237, "memory(GiB)": 21.51, "elapsed_time": "10h 6m 54s", "remaining_time": "1h 29m 47s", "loss_scale": 1.0, "consumed_samples": 1418752, "global_step/max_steps": "5542/6362"} +{"lm loss": 4.8560605, "grad_norm": 0.28123689, "learning_rate": 7.33e-06, "elapsed_time_per_iteration": 6.45161366, "memory(GiB)": 21.51, "elapsed_time": "10h 7m 0s", "remaining_time": "1h 29m 41s", "loss_scale": 1.0, "consumed_samples": 1419008, "global_step/max_steps": "5543/6362"} +{"lm loss": 4.85830069, "grad_norm": 0.28420943, "learning_rate": 7.32e-06, "elapsed_time_per_iteration": 6.72505546, "memory(GiB)": 21.51, "elapsed_time": "10h 7m 7s", "remaining_time": "1h 29m 34s", "loss_scale": 1.0, "consumed_samples": 1419264, "global_step/max_steps": "5544/6362"} +{"lm loss": 4.87458134, "grad_norm": 0.29177794, "learning_rate": 7.31e-06, "elapsed_time_per_iteration": 6.38275456, "memory(GiB)": 21.51, "elapsed_time": "10h 7m 13s", "remaining_time": "1h 29m 28s", "loss_scale": 1.0, "consumed_samples": 1419520, "global_step/max_steps": "5545/6362"} +{"lm loss": 4.88492489, "grad_norm": 0.27484491, "learning_rate": 7.3e-06, "elapsed_time_per_iteration": 6.73201132, "memory(GiB)": 21.51, "elapsed_time": "10h 7m 20s", "remaining_time": "1h 29m 21s", "loss_scale": 1.0, "consumed_samples": 1419776, "global_step/max_steps": "5546/6362"} +{"lm loss": 4.860569, "grad_norm": 0.2771042, "learning_rate": 7.29e-06, "elapsed_time_per_iteration": 6.74509144, "memory(GiB)": 21.51, "elapsed_time": "10h 7m 27s", "remaining_time": "1h 29m 15s", "loss_scale": 1.0, "consumed_samples": 1420032, "global_step/max_steps": "5547/6362"} +{"lm loss": 4.88571596, "grad_norm": 0.27611127, "learning_rate": 7.28e-06, "elapsed_time_per_iteration": 7.00966454, "memory(GiB)": 21.51, "elapsed_time": "10h 7m 34s", "remaining_time": "1h 29m 8s", "loss_scale": 1.0, "consumed_samples": 1420288, "global_step/max_steps": "5548/6362"} +{"lm loss": 4.86856556, "grad_norm": 0.28454199, "learning_rate": 7.27e-06, "elapsed_time_per_iteration": 6.54355645, "memory(GiB)": 21.51, "elapsed_time": "10h 7m 40s", "remaining_time": "1h 29m 1s", "loss_scale": 1.0, "consumed_samples": 1420544, "global_step/max_steps": "5549/6362"} +{"lm loss": 4.88185549, "grad_norm": 0.28913972, "learning_rate": 7.26e-06, "elapsed_time_per_iteration": 6.56939149, "memory(GiB)": 21.51, "elapsed_time": "10h 7m 47s", "remaining_time": "1h 28m 55s", "loss_scale": 1.0, "consumed_samples": 1420800, "global_step/max_steps": "5550/6362"} +{"lm loss": 4.87053442, "grad_norm": 0.29108223, "learning_rate": 7.25e-06, "elapsed_time_per_iteration": 6.5373137, "memory(GiB)": 21.51, "elapsed_time": "10h 7m 53s", "remaining_time": "1h 28m 48s", "loss_scale": 1.0, "consumed_samples": 1421056, "global_step/max_steps": "5551/6362"} +{"lm loss": 4.84694862, "grad_norm": 0.27344501, "learning_rate": 7.24e-06, "elapsed_time_per_iteration": 6.6148088, "memory(GiB)": 21.51, "elapsed_time": "10h 8m 0s", "remaining_time": "1h 28m 42s", "loss_scale": 1.0, "consumed_samples": 1421312, "global_step/max_steps": "5552/6362"} +{"lm loss": 4.86720943, "grad_norm": 0.28286496, "learning_rate": 7.23e-06, "elapsed_time_per_iteration": 6.72551012, "memory(GiB)": 21.51, "elapsed_time": "10h 8m 7s", "remaining_time": "1h 28m 35s", "loss_scale": 1.0, "consumed_samples": 1421568, "global_step/max_steps": "5553/6362"} +{"lm loss": 4.8755908, "grad_norm": 0.28961417, "learning_rate": 7.22e-06, "elapsed_time_per_iteration": 6.59870362, "memory(GiB)": 21.51, "elapsed_time": "10h 8m 13s", "remaining_time": "1h 28m 29s", "loss_scale": 1.0, "consumed_samples": 1421824, "global_step/max_steps": "5554/6362"} +{"lm loss": 4.88086939, "grad_norm": 0.27812627, "learning_rate": 7.2e-06, "elapsed_time_per_iteration": 6.68186545, "memory(GiB)": 21.51, "elapsed_time": "10h 8m 20s", "remaining_time": "1h 28m 22s", "loss_scale": 1.0, "consumed_samples": 1422080, "global_step/max_steps": "5555/6362"} +{"lm loss": 4.86652422, "grad_norm": 0.28870127, "learning_rate": 7.19e-06, "elapsed_time_per_iteration": 6.37232852, "memory(GiB)": 21.51, "elapsed_time": "10h 8m 26s", "remaining_time": "1h 28m 15s", "loss_scale": 1.0, "consumed_samples": 1422336, "global_step/max_steps": "5556/6362"} +{"lm loss": 4.85919428, "grad_norm": 0.28408927, "learning_rate": 7.18e-06, "elapsed_time_per_iteration": 6.64562869, "memory(GiB)": 21.51, "elapsed_time": "10h 8m 33s", "remaining_time": "1h 28m 9s", "loss_scale": 1.0, "consumed_samples": 1422592, "global_step/max_steps": "5557/6362"} +{"lm loss": 4.86869526, "grad_norm": 0.29386473, "learning_rate": 7.17e-06, "elapsed_time_per_iteration": 6.48104048, "memory(GiB)": 21.51, "elapsed_time": "10h 8m 39s", "remaining_time": "1h 28m 2s", "loss_scale": 1.0, "consumed_samples": 1422848, "global_step/max_steps": "5558/6362"} +{"lm loss": 4.8917737, "grad_norm": 0.28633341, "learning_rate": 7.16e-06, "elapsed_time_per_iteration": 6.61072016, "memory(GiB)": 21.51, "elapsed_time": "10h 8m 46s", "remaining_time": "1h 27m 56s", "loss_scale": 1.0, "consumed_samples": 1423104, "global_step/max_steps": "5559/6362"} +{"lm loss": 4.85834026, "grad_norm": 0.29704452, "learning_rate": 7.15e-06, "elapsed_time_per_iteration": 6.6313324, "memory(GiB)": 21.51, "elapsed_time": "10h 8m 53s", "remaining_time": "1h 27m 49s", "loss_scale": 1.0, "consumed_samples": 1423360, "global_step/max_steps": "5560/6362"} +{"lm loss": 4.85556316, "grad_norm": 0.29334724, "learning_rate": 7.14e-06, "elapsed_time_per_iteration": 6.62232637, "memory(GiB)": 21.51, "elapsed_time": "10h 8m 59s", "remaining_time": "1h 27m 43s", "loss_scale": 1.0, "consumed_samples": 1423616, "global_step/max_steps": "5561/6362"} +{"lm loss": 4.87626123, "grad_norm": 0.31094164, "learning_rate": 7.13e-06, "elapsed_time_per_iteration": 6.71902776, "memory(GiB)": 21.51, "elapsed_time": "10h 9m 6s", "remaining_time": "1h 27m 36s", "loss_scale": 1.0, "consumed_samples": 1423872, "global_step/max_steps": "5562/6362"} +{"lm loss": 4.84900808, "grad_norm": 0.30198491, "learning_rate": 7.12e-06, "elapsed_time_per_iteration": 6.54653668, "memory(GiB)": 21.51, "elapsed_time": "10h 9m 12s", "remaining_time": "1h 27m 30s", "loss_scale": 1.0, "consumed_samples": 1424128, "global_step/max_steps": "5563/6362"} +{"lm loss": 4.88476896, "grad_norm": 0.31576025, "learning_rate": 7.11e-06, "elapsed_time_per_iteration": 6.52518272, "memory(GiB)": 21.51, "elapsed_time": "10h 9m 19s", "remaining_time": "1h 27m 23s", "loss_scale": 1.0, "consumed_samples": 1424384, "global_step/max_steps": "5564/6362"} +{"lm loss": 4.87528706, "grad_norm": 0.29647505, "learning_rate": 7.1e-06, "elapsed_time_per_iteration": 6.75612187, "memory(GiB)": 21.51, "elapsed_time": "10h 9m 26s", "remaining_time": "1h 27m 16s", "loss_scale": 1.0, "consumed_samples": 1424640, "global_step/max_steps": "5565/6362"} +{"lm loss": 4.88735723, "grad_norm": 0.302019, "learning_rate": 7.09e-06, "elapsed_time_per_iteration": 6.56909704, "memory(GiB)": 21.51, "elapsed_time": "10h 9m 32s", "remaining_time": "1h 27m 10s", "loss_scale": 1.0, "consumed_samples": 1424896, "global_step/max_steps": "5566/6362"} +{"lm loss": 4.85732508, "grad_norm": 0.27183056, "learning_rate": 7.08e-06, "elapsed_time_per_iteration": 6.33468437, "memory(GiB)": 21.51, "elapsed_time": "10h 9m 39s", "remaining_time": "1h 27m 3s", "loss_scale": 1.0, "consumed_samples": 1425152, "global_step/max_steps": "5567/6362"} +{"lm loss": 4.87409019, "grad_norm": 0.30394867, "learning_rate": 7.07e-06, "elapsed_time_per_iteration": 6.53423715, "memory(GiB)": 21.51, "elapsed_time": "10h 9m 45s", "remaining_time": "1h 26m 57s", "loss_scale": 1.0, "consumed_samples": 1425408, "global_step/max_steps": "5568/6362"} +{"lm loss": 4.86171818, "grad_norm": 0.27716759, "learning_rate": 7.06e-06, "elapsed_time_per_iteration": 6.71944356, "memory(GiB)": 21.51, "elapsed_time": "10h 9m 52s", "remaining_time": "1h 26m 50s", "loss_scale": 1.0, "consumed_samples": 1425664, "global_step/max_steps": "5569/6362"} +{"lm loss": 4.84930038, "grad_norm": 0.30132446, "learning_rate": 7.05e-06, "elapsed_time_per_iteration": 6.8105998, "memory(GiB)": 21.51, "elapsed_time": "10h 9m 59s", "remaining_time": "1h 26m 44s", "loss_scale": 1.0, "consumed_samples": 1425920, "global_step/max_steps": "5570/6362"} +{"lm loss": 4.88038778, "grad_norm": 0.28432786, "learning_rate": 7.04e-06, "elapsed_time_per_iteration": 6.68569279, "memory(GiB)": 21.51, "elapsed_time": "10h 10m 5s", "remaining_time": "1h 26m 37s", "loss_scale": 1.0, "consumed_samples": 1426176, "global_step/max_steps": "5571/6362"} +{"lm loss": 4.86366034, "grad_norm": 0.31096876, "learning_rate": 7.03e-06, "elapsed_time_per_iteration": 6.73342419, "memory(GiB)": 21.51, "elapsed_time": "10h 10m 12s", "remaining_time": "1h 26m 30s", "loss_scale": 1.0, "consumed_samples": 1426432, "global_step/max_steps": "5572/6362"} +{"lm loss": 4.86721945, "grad_norm": 0.28428555, "learning_rate": 7.02e-06, "elapsed_time_per_iteration": 6.57870221, "memory(GiB)": 21.51, "elapsed_time": "10h 10m 19s", "remaining_time": "1h 26m 24s", "loss_scale": 1.0, "consumed_samples": 1426688, "global_step/max_steps": "5573/6362"} +{"lm loss": 4.86681175, "grad_norm": 0.28469411, "learning_rate": 7.01e-06, "elapsed_time_per_iteration": 6.83998823, "memory(GiB)": 21.51, "elapsed_time": "10h 10m 26s", "remaining_time": "1h 26m 17s", "loss_scale": 1.0, "consumed_samples": 1426944, "global_step/max_steps": "5574/6362"} +{"lm loss": 4.87123537, "grad_norm": 0.30788743, "learning_rate": 7e-06, "elapsed_time_per_iteration": 6.61896276, "memory(GiB)": 21.51, "elapsed_time": "10h 10m 32s", "remaining_time": "1h 26m 11s", "loss_scale": 1.0, "consumed_samples": 1427200, "global_step/max_steps": "5575/6362"} +{"lm loss": 4.86147499, "grad_norm": 0.29661152, "learning_rate": 6.99e-06, "elapsed_time_per_iteration": 6.77471232, "memory(GiB)": 21.51, "elapsed_time": "10h 10m 39s", "remaining_time": "1h 26m 4s", "loss_scale": 1.0, "consumed_samples": 1427456, "global_step/max_steps": "5576/6362"} +{"lm loss": 4.8855567, "grad_norm": 0.29108253, "learning_rate": 6.98e-06, "elapsed_time_per_iteration": 6.52573681, "memory(GiB)": 21.51, "elapsed_time": "10h 10m 45s", "remaining_time": "1h 25m 58s", "loss_scale": 1.0, "consumed_samples": 1427712, "global_step/max_steps": "5577/6362"} +{"lm loss": 4.85038328, "grad_norm": 0.28408375, "learning_rate": 6.97e-06, "elapsed_time_per_iteration": 6.49721289, "memory(GiB)": 21.51, "elapsed_time": "10h 10m 52s", "remaining_time": "1h 25m 51s", "loss_scale": 1.0, "consumed_samples": 1427968, "global_step/max_steps": "5578/6362"} +{"lm loss": 4.8249979, "grad_norm": 0.29218608, "learning_rate": 6.96e-06, "elapsed_time_per_iteration": 6.61360645, "memory(GiB)": 21.51, "elapsed_time": "10h 10m 59s", "remaining_time": "1h 25m 45s", "loss_scale": 1.0, "consumed_samples": 1428224, "global_step/max_steps": "5579/6362"} +{"lm loss": 4.87857723, "grad_norm": 0.28469136, "learning_rate": 6.95e-06, "elapsed_time_per_iteration": 6.61863804, "memory(GiB)": 21.51, "elapsed_time": "10h 11m 5s", "remaining_time": "1h 25m 38s", "loss_scale": 1.0, "consumed_samples": 1428480, "global_step/max_steps": "5580/6362"} +{"lm loss": 4.87743855, "grad_norm": 0.29806611, "learning_rate": 6.94e-06, "elapsed_time_per_iteration": 6.48713112, "memory(GiB)": 21.51, "elapsed_time": "10h 11m 12s", "remaining_time": "1h 25m 31s", "loss_scale": 1.0, "consumed_samples": 1428736, "global_step/max_steps": "5581/6362"} +{"lm loss": 4.87739134, "grad_norm": 0.2896896, "learning_rate": 6.93e-06, "elapsed_time_per_iteration": 6.37436819, "memory(GiB)": 21.51, "elapsed_time": "10h 11m 18s", "remaining_time": "1h 25m 25s", "loss_scale": 1.0, "consumed_samples": 1428992, "global_step/max_steps": "5582/6362"} +{"lm loss": 4.87674999, "grad_norm": 0.2863079, "learning_rate": 6.92e-06, "elapsed_time_per_iteration": 6.35207009, "memory(GiB)": 21.51, "elapsed_time": "10h 11m 24s", "remaining_time": "1h 25m 18s", "loss_scale": 1.0, "consumed_samples": 1429248, "global_step/max_steps": "5583/6362"} +{"lm loss": 4.83895159, "grad_norm": 0.2829248, "learning_rate": 6.91e-06, "elapsed_time_per_iteration": 6.4456141, "memory(GiB)": 21.51, "elapsed_time": "10h 11m 31s", "remaining_time": "1h 25m 12s", "loss_scale": 1.0, "consumed_samples": 1429504, "global_step/max_steps": "5584/6362"} +{"lm loss": 4.88090563, "grad_norm": 0.30017638, "learning_rate": 6.9e-06, "elapsed_time_per_iteration": 6.58722091, "memory(GiB)": 21.51, "elapsed_time": "10h 11m 37s", "remaining_time": "1h 25m 5s", "loss_scale": 1.0, "consumed_samples": 1429760, "global_step/max_steps": "5585/6362"} +{"lm loss": 4.88954544, "grad_norm": 0.30760664, "learning_rate": 6.89e-06, "elapsed_time_per_iteration": 6.57163763, "memory(GiB)": 21.51, "elapsed_time": "10h 11m 44s", "remaining_time": "1h 24m 58s", "loss_scale": 1.0, "consumed_samples": 1430016, "global_step/max_steps": "5586/6362"} +{"lm loss": 4.90352774, "grad_norm": 0.28433761, "learning_rate": 6.88e-06, "elapsed_time_per_iteration": 6.48079753, "memory(GiB)": 21.51, "elapsed_time": "10h 11m 51s", "remaining_time": "1h 24m 52s", "loss_scale": 1.0, "consumed_samples": 1430272, "global_step/max_steps": "5587/6362"} +{"lm loss": 4.87571955, "grad_norm": 0.31189254, "learning_rate": 6.87e-06, "elapsed_time_per_iteration": 6.49888682, "memory(GiB)": 21.51, "elapsed_time": "10h 11m 57s", "remaining_time": "1h 24m 45s", "loss_scale": 1.0, "consumed_samples": 1430528, "global_step/max_steps": "5588/6362"} +{"lm loss": 4.86174679, "grad_norm": 0.30638704, "learning_rate": 6.86e-06, "elapsed_time_per_iteration": 6.43195915, "memory(GiB)": 21.51, "elapsed_time": "10h 12m 3s", "remaining_time": "1h 24m 39s", "loss_scale": 1.0, "consumed_samples": 1430784, "global_step/max_steps": "5589/6362"} +{"lm loss": 4.87105942, "grad_norm": 0.30418894, "learning_rate": 6.85e-06, "elapsed_time_per_iteration": 6.52382016, "memory(GiB)": 21.51, "elapsed_time": "10h 12m 10s", "remaining_time": "1h 24m 32s", "loss_scale": 1.0, "consumed_samples": 1431040, "global_step/max_steps": "5590/6362"} +{"lm loss": 4.8966918, "grad_norm": 0.29977223, "learning_rate": 6.84e-06, "elapsed_time_per_iteration": 6.41896462, "memory(GiB)": 21.51, "elapsed_time": "10h 12m 16s", "remaining_time": "1h 24m 26s", "loss_scale": 1.0, "consumed_samples": 1431296, "global_step/max_steps": "5591/6362"} +{"lm loss": 4.88308287, "grad_norm": 0.30687568, "learning_rate": 6.83e-06, "elapsed_time_per_iteration": 6.64441466, "memory(GiB)": 21.51, "elapsed_time": "10h 12m 23s", "remaining_time": "1h 24m 19s", "loss_scale": 1.0, "consumed_samples": 1431552, "global_step/max_steps": "5592/6362"} +{"lm loss": 4.83723688, "grad_norm": 0.30357987, "learning_rate": 6.82e-06, "elapsed_time_per_iteration": 6.38787985, "memory(GiB)": 21.51, "elapsed_time": "10h 12m 29s", "remaining_time": "1h 24m 12s", "loss_scale": 1.0, "consumed_samples": 1431808, "global_step/max_steps": "5593/6362"} +{"lm loss": 4.87280703, "grad_norm": 0.29712132, "learning_rate": 6.81e-06, "elapsed_time_per_iteration": 6.42443848, "memory(GiB)": 21.51, "elapsed_time": "10h 12m 36s", "remaining_time": "1h 24m 6s", "loss_scale": 1.0, "consumed_samples": 1432064, "global_step/max_steps": "5594/6362"} +{"lm loss": 4.88655233, "grad_norm": 0.30597493, "learning_rate": 6.8e-06, "elapsed_time_per_iteration": 6.30138016, "memory(GiB)": 21.51, "elapsed_time": "10h 12m 42s", "remaining_time": "1h 23m 59s", "loss_scale": 1.0, "consumed_samples": 1432320, "global_step/max_steps": "5595/6362"} +{"lm loss": 4.84769487, "grad_norm": 0.28184348, "learning_rate": 6.79e-06, "elapsed_time_per_iteration": 6.79608297, "memory(GiB)": 21.51, "elapsed_time": "10h 12m 49s", "remaining_time": "1h 23m 53s", "loss_scale": 1.0, "consumed_samples": 1432576, "global_step/max_steps": "5596/6362"} +{"lm loss": 4.86487341, "grad_norm": 0.27787912, "learning_rate": 6.78e-06, "elapsed_time_per_iteration": 6.66949272, "memory(GiB)": 21.51, "elapsed_time": "10h 12m 56s", "remaining_time": "1h 23m 46s", "loss_scale": 1.0, "consumed_samples": 1432832, "global_step/max_steps": "5597/6362"} +{"lm loss": 4.87698984, "grad_norm": 0.30129886, "learning_rate": 6.77e-06, "elapsed_time_per_iteration": 6.54937983, "memory(GiB)": 21.51, "elapsed_time": "10h 13m 2s", "remaining_time": "1h 23m 39s", "loss_scale": 1.0, "consumed_samples": 1433088, "global_step/max_steps": "5598/6362"} +{"lm loss": 4.86473799, "grad_norm": 0.29070079, "learning_rate": 6.76e-06, "elapsed_time_per_iteration": 6.62472939, "memory(GiB)": 21.51, "elapsed_time": "10h 13m 9s", "remaining_time": "1h 23m 33s", "loss_scale": 1.0, "consumed_samples": 1433344, "global_step/max_steps": "5599/6362"} +{"lm loss": 4.86467361, "grad_norm": 0.29061598, "learning_rate": 6.75e-06, "elapsed_time_per_iteration": 6.56545591, "memory(GiB)": 21.51, "elapsed_time": "10h 13m 15s", "remaining_time": "1h 23m 26s", "loss_scale": 1.0, "consumed_samples": 1433600, "global_step/max_steps": "5600/6362"} +{"lm loss": 4.8494997, "grad_norm": 0.29753968, "learning_rate": 6.75e-06, "elapsed_time_per_iteration": 6.65967417, "memory(GiB)": 21.51, "elapsed_time": "10h 13m 22s", "remaining_time": "1h 23m 20s", "loss_scale": 1.0, "consumed_samples": 1433856, "global_step/max_steps": "5601/6362"} +{"lm loss": 4.87266254, "grad_norm": 0.29488775, "learning_rate": 6.74e-06, "elapsed_time_per_iteration": 6.62888479, "memory(GiB)": 21.51, "elapsed_time": "10h 13m 29s", "remaining_time": "1h 23m 13s", "loss_scale": 1.0, "consumed_samples": 1434112, "global_step/max_steps": "5602/6362"} +{"lm loss": 4.89501762, "grad_norm": 0.29906484, "learning_rate": 6.73e-06, "elapsed_time_per_iteration": 6.63207531, "memory(GiB)": 21.51, "elapsed_time": "10h 13m 35s", "remaining_time": "1h 23m 7s", "loss_scale": 1.0, "consumed_samples": 1434368, "global_step/max_steps": "5603/6362"} +{"lm loss": 4.8688941, "grad_norm": 0.28823459, "learning_rate": 6.72e-06, "elapsed_time_per_iteration": 6.45861387, "memory(GiB)": 21.51, "elapsed_time": "10h 13m 42s", "remaining_time": "1h 23m 0s", "loss_scale": 1.0, "consumed_samples": 1434624, "global_step/max_steps": "5604/6362"} +{"lm loss": 4.87228966, "grad_norm": 0.28425658, "learning_rate": 6.71e-06, "elapsed_time_per_iteration": 6.57730913, "memory(GiB)": 21.51, "elapsed_time": "10h 13m 48s", "remaining_time": "1h 22m 54s", "loss_scale": 1.0, "consumed_samples": 1434880, "global_step/max_steps": "5605/6362"} +{"lm loss": 4.88344812, "grad_norm": 0.29748341, "learning_rate": 6.7e-06, "elapsed_time_per_iteration": 6.72776008, "memory(GiB)": 21.51, "elapsed_time": "10h 13m 55s", "remaining_time": "1h 22m 47s", "loss_scale": 1.0, "consumed_samples": 1435136, "global_step/max_steps": "5606/6362"} +{"lm loss": 4.87868309, "grad_norm": 0.28988367, "learning_rate": 6.69e-06, "elapsed_time_per_iteration": 6.77715445, "memory(GiB)": 21.51, "elapsed_time": "10h 14m 2s", "remaining_time": "1h 22m 40s", "loss_scale": 1.0, "consumed_samples": 1435392, "global_step/max_steps": "5607/6362"} +{"lm loss": 4.88246346, "grad_norm": 0.29412323, "learning_rate": 6.68e-06, "elapsed_time_per_iteration": 6.71878409, "memory(GiB)": 21.51, "elapsed_time": "10h 14m 9s", "remaining_time": "1h 22m 34s", "loss_scale": 1.0, "consumed_samples": 1435648, "global_step/max_steps": "5608/6362"} +{"lm loss": 4.87065554, "grad_norm": 0.27588052, "learning_rate": 6.67e-06, "elapsed_time_per_iteration": 6.53618455, "memory(GiB)": 21.51, "elapsed_time": "10h 14m 15s", "remaining_time": "1h 22m 27s", "loss_scale": 1.0, "consumed_samples": 1435904, "global_step/max_steps": "5609/6362"} +{"lm loss": 4.87785816, "grad_norm": 0.28924745, "learning_rate": 6.66e-06, "elapsed_time_per_iteration": 6.82163572, "memory(GiB)": 21.51, "elapsed_time": "10h 14m 22s", "remaining_time": "1h 22m 21s", "loss_scale": 1.0, "consumed_samples": 1436160, "global_step/max_steps": "5610/6362"} +{"lm loss": 4.86513424, "grad_norm": 0.27806666, "learning_rate": 6.65e-06, "elapsed_time_per_iteration": 6.48557472, "memory(GiB)": 21.51, "elapsed_time": "10h 14m 28s", "remaining_time": "1h 22m 14s", "loss_scale": 1.0, "consumed_samples": 1436416, "global_step/max_steps": "5611/6362"} +{"lm loss": 4.84256124, "grad_norm": 0.28447467, "learning_rate": 6.64e-06, "elapsed_time_per_iteration": 6.78326964, "memory(GiB)": 21.51, "elapsed_time": "10h 14m 35s", "remaining_time": "1h 22m 8s", "loss_scale": 1.0, "consumed_samples": 1436672, "global_step/max_steps": "5612/6362"} +{"lm loss": 4.86859703, "grad_norm": 0.29902381, "learning_rate": 6.63e-06, "elapsed_time_per_iteration": 6.63858128, "memory(GiB)": 21.51, "elapsed_time": "10h 14m 42s", "remaining_time": "1h 22m 1s", "loss_scale": 1.0, "consumed_samples": 1436928, "global_step/max_steps": "5613/6362"} +{"lm loss": 4.87914705, "grad_norm": 0.3012045, "learning_rate": 6.62e-06, "elapsed_time_per_iteration": 6.64116931, "memory(GiB)": 21.51, "elapsed_time": "10h 14m 48s", "remaining_time": "1h 21m 55s", "loss_scale": 1.0, "consumed_samples": 1437184, "global_step/max_steps": "5614/6362"} +{"lm loss": 4.85825777, "grad_norm": 0.26707929, "learning_rate": 6.61e-06, "elapsed_time_per_iteration": 6.70410228, "memory(GiB)": 21.51, "elapsed_time": "10h 14m 55s", "remaining_time": "1h 21m 48s", "loss_scale": 1.0, "consumed_samples": 1437440, "global_step/max_steps": "5615/6362"} +{"lm loss": 4.87397528, "grad_norm": 0.28516978, "learning_rate": 6.6e-06, "elapsed_time_per_iteration": 6.87766671, "memory(GiB)": 21.51, "elapsed_time": "10h 15m 2s", "remaining_time": "1h 21m 41s", "loss_scale": 1.0, "consumed_samples": 1437696, "global_step/max_steps": "5616/6362"} +{"lm loss": 4.87942457, "grad_norm": 0.28943276, "learning_rate": 6.59e-06, "elapsed_time_per_iteration": 6.57384944, "memory(GiB)": 21.51, "elapsed_time": "10h 15m 9s", "remaining_time": "1h 21m 35s", "loss_scale": 1.0, "consumed_samples": 1437952, "global_step/max_steps": "5617/6362"} +{"lm loss": 4.86299086, "grad_norm": 0.28333032, "learning_rate": 6.58e-06, "elapsed_time_per_iteration": 6.75665069, "memory(GiB)": 21.51, "elapsed_time": "10h 15m 15s", "remaining_time": "1h 21m 28s", "loss_scale": 1.0, "consumed_samples": 1438208, "global_step/max_steps": "5618/6362"} +{"lm loss": 4.88345575, "grad_norm": 0.27772924, "learning_rate": 6.57e-06, "elapsed_time_per_iteration": 6.44107604, "memory(GiB)": 21.51, "elapsed_time": "10h 15m 22s", "remaining_time": "1h 21m 22s", "loss_scale": 1.0, "consumed_samples": 1438464, "global_step/max_steps": "5619/6362"} +{"lm loss": 4.86700773, "grad_norm": 0.27252522, "learning_rate": 6.56e-06, "elapsed_time_per_iteration": 6.34684873, "memory(GiB)": 21.51, "elapsed_time": "10h 15m 28s", "remaining_time": "1h 21m 15s", "loss_scale": 1.0, "consumed_samples": 1438720, "global_step/max_steps": "5620/6362"} +{"lm loss": 4.86367369, "grad_norm": 0.29756123, "learning_rate": 6.55e-06, "elapsed_time_per_iteration": 6.5208683, "memory(GiB)": 21.51, "elapsed_time": "10h 15m 35s", "remaining_time": "1h 21m 9s", "loss_scale": 1.0, "consumed_samples": 1438976, "global_step/max_steps": "5621/6362"} +{"lm loss": 4.87132072, "grad_norm": 0.28885618, "learning_rate": 6.54e-06, "elapsed_time_per_iteration": 6.63374543, "memory(GiB)": 21.51, "elapsed_time": "10h 15m 41s", "remaining_time": "1h 21m 2s", "loss_scale": 1.0, "consumed_samples": 1439232, "global_step/max_steps": "5622/6362"} +{"lm loss": 4.84977293, "grad_norm": 0.28566283, "learning_rate": 6.53e-06, "elapsed_time_per_iteration": 6.73763394, "memory(GiB)": 21.51, "elapsed_time": "10h 15m 48s", "remaining_time": "1h 20m 55s", "loss_scale": 1.0, "consumed_samples": 1439488, "global_step/max_steps": "5623/6362"} +{"lm loss": 4.86684704, "grad_norm": 0.29527986, "learning_rate": 6.52e-06, "elapsed_time_per_iteration": 6.46968889, "memory(GiB)": 21.51, "elapsed_time": "10h 15m 55s", "remaining_time": "1h 20m 49s", "loss_scale": 1.0, "consumed_samples": 1439744, "global_step/max_steps": "5624/6362"} +{"lm loss": 4.84490681, "grad_norm": 0.28056559, "learning_rate": 6.52e-06, "elapsed_time_per_iteration": 6.52901196, "memory(GiB)": 21.51, "elapsed_time": "10h 16m 1s", "remaining_time": "1h 20m 42s", "loss_scale": 1.0, "consumed_samples": 1440000, "global_step/max_steps": "5625/6362"} +{"lm loss": 4.86765194, "grad_norm": 0.2853353, "learning_rate": 6.51e-06, "elapsed_time_per_iteration": 6.76634216, "memory(GiB)": 21.51, "elapsed_time": "10h 16m 8s", "remaining_time": "1h 20m 36s", "loss_scale": 1.0, "consumed_samples": 1440256, "global_step/max_steps": "5626/6362"} +{"lm loss": 4.87318611, "grad_norm": 0.29934517, "learning_rate": 6.5e-06, "elapsed_time_per_iteration": 6.69602418, "memory(GiB)": 21.51, "elapsed_time": "10h 16m 14s", "remaining_time": "1h 20m 29s", "loss_scale": 1.0, "consumed_samples": 1440512, "global_step/max_steps": "5627/6362"} +{"lm loss": 4.91275263, "grad_norm": 0.26778746, "learning_rate": 6.49e-06, "elapsed_time_per_iteration": 6.65088344, "memory(GiB)": 21.51, "elapsed_time": "10h 16m 21s", "remaining_time": "1h 20m 23s", "loss_scale": 1.0, "consumed_samples": 1440768, "global_step/max_steps": "5628/6362"} +{"lm loss": 4.86428738, "grad_norm": 0.27862567, "learning_rate": 6.48e-06, "elapsed_time_per_iteration": 6.38231969, "memory(GiB)": 21.51, "elapsed_time": "10h 16m 28s", "remaining_time": "1h 20m 16s", "loss_scale": 1.0, "consumed_samples": 1441024, "global_step/max_steps": "5629/6362"} +{"lm loss": 4.87473679, "grad_norm": 0.30613169, "learning_rate": 6.47e-06, "elapsed_time_per_iteration": 6.64213753, "memory(GiB)": 21.51, "elapsed_time": "10h 16m 34s", "remaining_time": "1h 20m 9s", "loss_scale": 1.0, "consumed_samples": 1441280, "global_step/max_steps": "5630/6362"} +{"lm loss": 4.88781309, "grad_norm": 0.28794396, "learning_rate": 6.46e-06, "elapsed_time_per_iteration": 6.42436528, "memory(GiB)": 21.51, "elapsed_time": "10h 16m 41s", "remaining_time": "1h 20m 3s", "loss_scale": 1.0, "consumed_samples": 1441536, "global_step/max_steps": "5631/6362"} +{"lm loss": 4.85136509, "grad_norm": 0.29418242, "learning_rate": 6.45e-06, "elapsed_time_per_iteration": 6.59651899, "memory(GiB)": 21.51, "elapsed_time": "10h 16m 47s", "remaining_time": "1h 19m 56s", "loss_scale": 1.0, "consumed_samples": 1441792, "global_step/max_steps": "5632/6362"} +{"lm loss": 4.88303375, "grad_norm": 0.27912599, "learning_rate": 6.44e-06, "elapsed_time_per_iteration": 6.41194367, "memory(GiB)": 21.51, "elapsed_time": "10h 16m 54s", "remaining_time": "1h 19m 50s", "loss_scale": 1.0, "consumed_samples": 1442048, "global_step/max_steps": "5633/6362"} +{"lm loss": 4.8809309, "grad_norm": 0.27156001, "learning_rate": 6.43e-06, "elapsed_time_per_iteration": 6.5066998, "memory(GiB)": 21.51, "elapsed_time": "10h 17m 0s", "remaining_time": "1h 19m 43s", "loss_scale": 1.0, "consumed_samples": 1442304, "global_step/max_steps": "5634/6362"} +{"lm loss": 4.8594842, "grad_norm": 0.30044854, "learning_rate": 6.42e-06, "elapsed_time_per_iteration": 6.47810721, "memory(GiB)": 21.51, "elapsed_time": "10h 17m 7s", "remaining_time": "1h 19m 37s", "loss_scale": 1.0, "consumed_samples": 1442560, "global_step/max_steps": "5635/6362"} +{"lm loss": 4.8553133, "grad_norm": 0.26664212, "learning_rate": 6.41e-06, "elapsed_time_per_iteration": 6.63670945, "memory(GiB)": 21.51, "elapsed_time": "10h 17m 13s", "remaining_time": "1h 19m 30s", "loss_scale": 1.0, "consumed_samples": 1442816, "global_step/max_steps": "5636/6362"} +{"lm loss": 4.89380026, "grad_norm": 0.29117858, "learning_rate": 6.4e-06, "elapsed_time_per_iteration": 6.60978603, "memory(GiB)": 21.51, "elapsed_time": "10h 17m 20s", "remaining_time": "1h 19m 23s", "loss_scale": 1.0, "consumed_samples": 1443072, "global_step/max_steps": "5637/6362"} +{"lm loss": 4.87683582, "grad_norm": 0.2874102, "learning_rate": 6.39e-06, "elapsed_time_per_iteration": 6.6091392, "memory(GiB)": 21.51, "elapsed_time": "10h 17m 26s", "remaining_time": "1h 19m 17s", "loss_scale": 1.0, "consumed_samples": 1443328, "global_step/max_steps": "5638/6362"} +{"lm loss": 4.85600185, "grad_norm": 0.28210631, "learning_rate": 6.38e-06, "elapsed_time_per_iteration": 6.85712814, "memory(GiB)": 21.51, "elapsed_time": "10h 17m 33s", "remaining_time": "1h 19m 10s", "loss_scale": 1.0, "consumed_samples": 1443584, "global_step/max_steps": "5639/6362"} +{"lm loss": 4.87797403, "grad_norm": 0.28590345, "learning_rate": 6.38e-06, "elapsed_time_per_iteration": 6.56812787, "memory(GiB)": 21.51, "elapsed_time": "10h 17m 40s", "remaining_time": "1h 19m 4s", "loss_scale": 1.0, "consumed_samples": 1443840, "global_step/max_steps": "5640/6362"} +{"lm loss": 4.84341669, "grad_norm": 0.27512541, "learning_rate": 6.37e-06, "elapsed_time_per_iteration": 6.63306236, "memory(GiB)": 21.51, "elapsed_time": "10h 17m 46s", "remaining_time": "1h 18m 57s", "loss_scale": 1.0, "consumed_samples": 1444096, "global_step/max_steps": "5641/6362"} +{"lm loss": 4.86963701, "grad_norm": 0.28516462, "learning_rate": 6.36e-06, "elapsed_time_per_iteration": 6.52587461, "memory(GiB)": 21.51, "elapsed_time": "10h 17m 53s", "remaining_time": "1h 18m 51s", "loss_scale": 1.0, "consumed_samples": 1444352, "global_step/max_steps": "5642/6362"} +{"lm loss": 4.86035252, "grad_norm": 0.32665667, "learning_rate": 6.35e-06, "elapsed_time_per_iteration": 6.49652338, "memory(GiB)": 21.51, "elapsed_time": "10h 18m 0s", "remaining_time": "1h 18m 44s", "loss_scale": 1.0, "consumed_samples": 1444608, "global_step/max_steps": "5643/6362"} +{"lm loss": 4.88066149, "grad_norm": 0.27040747, "learning_rate": 6.34e-06, "elapsed_time_per_iteration": 6.45267153, "memory(GiB)": 21.51, "elapsed_time": "10h 18m 6s", "remaining_time": "1h 18m 37s", "loss_scale": 1.0, "consumed_samples": 1444864, "global_step/max_steps": "5644/6362"} +{"lm loss": 4.86496878, "grad_norm": 0.30132136, "learning_rate": 6.33e-06, "elapsed_time_per_iteration": 6.65641761, "memory(GiB)": 21.51, "elapsed_time": "10h 18m 13s", "remaining_time": "1h 18m 31s", "loss_scale": 1.0, "consumed_samples": 1445120, "global_step/max_steps": "5645/6362"} +{"lm loss": 4.87104177, "grad_norm": 0.31103492, "learning_rate": 6.32e-06, "elapsed_time_per_iteration": 6.63710546, "memory(GiB)": 21.51, "elapsed_time": "10h 18m 19s", "remaining_time": "1h 18m 24s", "loss_scale": 1.0, "consumed_samples": 1445376, "global_step/max_steps": "5646/6362"} +{"lm loss": 4.86163855, "grad_norm": 0.27467445, "learning_rate": 6.31e-06, "elapsed_time_per_iteration": 6.69471717, "memory(GiB)": 21.51, "elapsed_time": "10h 18m 26s", "remaining_time": "1h 18m 18s", "loss_scale": 1.0, "consumed_samples": 1445632, "global_step/max_steps": "5647/6362"} +{"lm loss": 4.8450985, "grad_norm": 0.3080174, "learning_rate": 6.3e-06, "elapsed_time_per_iteration": 6.92465019, "memory(GiB)": 21.51, "elapsed_time": "10h 18m 33s", "remaining_time": "1h 18m 11s", "loss_scale": 1.0, "consumed_samples": 1445888, "global_step/max_steps": "5648/6362"} +{"lm loss": 4.85907269, "grad_norm": 0.29563487, "learning_rate": 6.29e-06, "elapsed_time_per_iteration": 6.46934557, "memory(GiB)": 21.51, "elapsed_time": "10h 18m 39s", "remaining_time": "1h 18m 5s", "loss_scale": 1.0, "consumed_samples": 1446144, "global_step/max_steps": "5649/6362"} +{"lm loss": 4.87892628, "grad_norm": 0.27190521, "learning_rate": 6.28e-06, "elapsed_time_per_iteration": 6.58561254, "memory(GiB)": 21.51, "elapsed_time": "10h 18m 46s", "remaining_time": "1h 17m 58s", "loss_scale": 1.0, "consumed_samples": 1446400, "global_step/max_steps": "5650/6362"} +{"lm loss": 4.85399532, "grad_norm": 0.28033969, "learning_rate": 6.27e-06, "elapsed_time_per_iteration": 6.39910626, "memory(GiB)": 21.51, "elapsed_time": "10h 18m 52s", "remaining_time": "1h 17m 51s", "loss_scale": 1.0, "consumed_samples": 1446656, "global_step/max_steps": "5651/6362"} +{"lm loss": 4.89732456, "grad_norm": 0.26968673, "learning_rate": 6.27e-06, "elapsed_time_per_iteration": 6.48033714, "memory(GiB)": 21.51, "elapsed_time": "10h 18m 59s", "remaining_time": "1h 17m 45s", "loss_scale": 1.0, "consumed_samples": 1446912, "global_step/max_steps": "5652/6362"} +{"lm loss": 4.87850046, "grad_norm": 0.27871263, "learning_rate": 6.26e-06, "elapsed_time_per_iteration": 6.55313182, "memory(GiB)": 21.51, "elapsed_time": "10h 19m 5s", "remaining_time": "1h 17m 38s", "loss_scale": 1.0, "consumed_samples": 1447168, "global_step/max_steps": "5653/6362"} +{"lm loss": 4.87502098, "grad_norm": 0.2770637, "learning_rate": 6.25e-06, "elapsed_time_per_iteration": 6.78169394, "memory(GiB)": 21.51, "elapsed_time": "10h 19m 12s", "remaining_time": "1h 17m 32s", "loss_scale": 1.0, "consumed_samples": 1447424, "global_step/max_steps": "5654/6362"} +{"lm loss": 4.86588192, "grad_norm": 0.2704756, "learning_rate": 6.24e-06, "elapsed_time_per_iteration": 6.66884351, "memory(GiB)": 21.51, "elapsed_time": "10h 19m 19s", "remaining_time": "1h 17m 25s", "loss_scale": 1.0, "consumed_samples": 1447680, "global_step/max_steps": "5655/6362"} +{"lm loss": 4.89297247, "grad_norm": 0.27343079, "learning_rate": 6.23e-06, "elapsed_time_per_iteration": 6.55250931, "memory(GiB)": 21.51, "elapsed_time": "10h 19m 25s", "remaining_time": "1h 17m 19s", "loss_scale": 1.0, "consumed_samples": 1447936, "global_step/max_steps": "5656/6362"} +{"lm loss": 4.86675787, "grad_norm": 0.26654425, "learning_rate": 6.22e-06, "elapsed_time_per_iteration": 6.67071843, "memory(GiB)": 21.51, "elapsed_time": "10h 19m 32s", "remaining_time": "1h 17m 12s", "loss_scale": 1.0, "consumed_samples": 1448192, "global_step/max_steps": "5657/6362"} +{"lm loss": 4.85960913, "grad_norm": 0.27119255, "learning_rate": 6.21e-06, "elapsed_time_per_iteration": 6.48229361, "memory(GiB)": 21.51, "elapsed_time": "10h 19m 39s", "remaining_time": "1h 17m 6s", "loss_scale": 1.0, "consumed_samples": 1448448, "global_step/max_steps": "5658/6362"} +{"lm loss": 4.87432814, "grad_norm": 0.27864599, "learning_rate": 6.2e-06, "elapsed_time_per_iteration": 6.6438334, "memory(GiB)": 21.51, "elapsed_time": "10h 19m 45s", "remaining_time": "1h 16m 59s", "loss_scale": 1.0, "consumed_samples": 1448704, "global_step/max_steps": "5659/6362"} +{"lm loss": 4.87695599, "grad_norm": 0.28177378, "learning_rate": 6.19e-06, "elapsed_time_per_iteration": 6.5230298, "memory(GiB)": 21.51, "elapsed_time": "10h 19m 52s", "remaining_time": "1h 16m 52s", "loss_scale": 1.0, "consumed_samples": 1448960, "global_step/max_steps": "5660/6362"} +{"lm loss": 4.87458038, "grad_norm": 0.28025603, "learning_rate": 6.18e-06, "elapsed_time_per_iteration": 6.65446711, "memory(GiB)": 21.51, "elapsed_time": "10h 19m 58s", "remaining_time": "1h 16m 46s", "loss_scale": 1.0, "consumed_samples": 1449216, "global_step/max_steps": "5661/6362"} +{"lm loss": 4.87872124, "grad_norm": 0.26666704, "learning_rate": 6.18e-06, "elapsed_time_per_iteration": 6.49741483, "memory(GiB)": 21.51, "elapsed_time": "10h 20m 5s", "remaining_time": "1h 16m 39s", "loss_scale": 1.0, "consumed_samples": 1449472, "global_step/max_steps": "5662/6362"} +{"lm loss": 4.86090183, "grad_norm": 0.28192517, "learning_rate": 6.17e-06, "elapsed_time_per_iteration": 6.7125535, "memory(GiB)": 21.51, "elapsed_time": "10h 20m 12s", "remaining_time": "1h 16m 33s", "loss_scale": 1.0, "consumed_samples": 1449728, "global_step/max_steps": "5663/6362"} +{"lm loss": 4.86971474, "grad_norm": 0.2690751, "learning_rate": 6.16e-06, "elapsed_time_per_iteration": 6.5978477, "memory(GiB)": 21.51, "elapsed_time": "10h 20m 18s", "remaining_time": "1h 16m 26s", "loss_scale": 1.0, "consumed_samples": 1449984, "global_step/max_steps": "5664/6362"} +{"lm loss": 4.88830042, "grad_norm": 0.26227874, "learning_rate": 6.15e-06, "elapsed_time_per_iteration": 6.56914592, "memory(GiB)": 21.51, "elapsed_time": "10h 20m 25s", "remaining_time": "1h 16m 20s", "loss_scale": 1.0, "consumed_samples": 1450240, "global_step/max_steps": "5665/6362"} +{"lm loss": 4.86828279, "grad_norm": 0.28601617, "learning_rate": 6.14e-06, "elapsed_time_per_iteration": 6.5864532, "memory(GiB)": 21.51, "elapsed_time": "10h 20m 31s", "remaining_time": "1h 16m 13s", "loss_scale": 1.0, "consumed_samples": 1450496, "global_step/max_steps": "5666/6362"} +{"lm loss": 4.85634422, "grad_norm": 0.29001886, "learning_rate": 6.13e-06, "elapsed_time_per_iteration": 6.60821486, "memory(GiB)": 21.51, "elapsed_time": "10h 20m 38s", "remaining_time": "1h 16m 6s", "loss_scale": 1.0, "consumed_samples": 1450752, "global_step/max_steps": "5667/6362"} +{"lm loss": 4.86264563, "grad_norm": 0.26757428, "learning_rate": 6.12e-06, "elapsed_time_per_iteration": 6.54927874, "memory(GiB)": 21.51, "elapsed_time": "10h 20m 44s", "remaining_time": "1h 16m 0s", "loss_scale": 1.0, "consumed_samples": 1451008, "global_step/max_steps": "5668/6362"} +{"lm loss": 4.86007261, "grad_norm": 0.27043518, "learning_rate": 6.11e-06, "elapsed_time_per_iteration": 6.59035707, "memory(GiB)": 21.51, "elapsed_time": "10h 20m 51s", "remaining_time": "1h 15m 53s", "loss_scale": 1.0, "consumed_samples": 1451264, "global_step/max_steps": "5669/6362"} +{"lm loss": 4.8586092, "grad_norm": 0.27426755, "learning_rate": 6.1e-06, "elapsed_time_per_iteration": 6.60205579, "memory(GiB)": 21.51, "elapsed_time": "10h 20m 58s", "remaining_time": "1h 15m 47s", "loss_scale": 1.0, "consumed_samples": 1451520, "global_step/max_steps": "5670/6362"} +{"lm loss": 4.86795378, "grad_norm": 0.28159806, "learning_rate": 6.09e-06, "elapsed_time_per_iteration": 6.46253061, "memory(GiB)": 21.51, "elapsed_time": "10h 21m 4s", "remaining_time": "1h 15m 40s", "loss_scale": 1.0, "consumed_samples": 1451776, "global_step/max_steps": "5671/6362"} +{"lm loss": 4.8578701, "grad_norm": 0.2962752, "learning_rate": 6.09e-06, "elapsed_time_per_iteration": 6.52340102, "memory(GiB)": 21.51, "elapsed_time": "10h 21m 11s", "remaining_time": "1h 15m 34s", "loss_scale": 1.0, "consumed_samples": 1452032, "global_step/max_steps": "5672/6362"} +{"lm loss": 4.89682865, "grad_norm": 0.29210514, "learning_rate": 6.08e-06, "elapsed_time_per_iteration": 6.55020046, "memory(GiB)": 21.51, "elapsed_time": "10h 21m 17s", "remaining_time": "1h 15m 27s", "loss_scale": 1.0, "consumed_samples": 1452288, "global_step/max_steps": "5673/6362"} +{"lm loss": 4.88327551, "grad_norm": 0.2868818, "learning_rate": 6.07e-06, "elapsed_time_per_iteration": 6.6297121, "memory(GiB)": 21.51, "elapsed_time": "10h 21m 24s", "remaining_time": "1h 15m 20s", "loss_scale": 1.0, "consumed_samples": 1452544, "global_step/max_steps": "5674/6362"} +{"lm loss": 4.86103201, "grad_norm": 0.28937882, "learning_rate": 6.06e-06, "elapsed_time_per_iteration": 6.50817776, "memory(GiB)": 21.51, "elapsed_time": "10h 21m 30s", "remaining_time": "1h 15m 14s", "loss_scale": 1.0, "consumed_samples": 1452800, "global_step/max_steps": "5675/6362"} +{"lm loss": 4.85957193, "grad_norm": 0.29063162, "learning_rate": 6.05e-06, "elapsed_time_per_iteration": 6.66824841, "memory(GiB)": 21.51, "elapsed_time": "10h 21m 37s", "remaining_time": "1h 15m 7s", "loss_scale": 1.0, "consumed_samples": 1453056, "global_step/max_steps": "5676/6362"} +{"lm loss": 4.86934471, "grad_norm": 0.2743012, "learning_rate": 6.04e-06, "elapsed_time_per_iteration": 6.62785387, "memory(GiB)": 21.51, "elapsed_time": "10h 21m 44s", "remaining_time": "1h 15m 1s", "loss_scale": 1.0, "consumed_samples": 1453312, "global_step/max_steps": "5677/6362"} +{"lm loss": 4.85464287, "grad_norm": 0.29422674, "learning_rate": 6.03e-06, "elapsed_time_per_iteration": 6.33853912, "memory(GiB)": 21.51, "elapsed_time": "10h 21m 50s", "remaining_time": "1h 14m 54s", "loss_scale": 1.0, "consumed_samples": 1453568, "global_step/max_steps": "5678/6362"} +{"lm loss": 4.84884691, "grad_norm": 0.28139582, "learning_rate": 6.02e-06, "elapsed_time_per_iteration": 6.44567323, "memory(GiB)": 21.51, "elapsed_time": "10h 21m 56s", "remaining_time": "1h 14m 48s", "loss_scale": 1.0, "consumed_samples": 1453824, "global_step/max_steps": "5679/6362"} +{"lm loss": 4.87562084, "grad_norm": 0.26669648, "learning_rate": 6.02e-06, "elapsed_time_per_iteration": 6.43710518, "memory(GiB)": 21.51, "elapsed_time": "10h 22m 3s", "remaining_time": "1h 14m 41s", "loss_scale": 1.0, "consumed_samples": 1454080, "global_step/max_steps": "5680/6362"} +{"lm loss": 4.85642862, "grad_norm": 0.28798085, "learning_rate": 6.01e-06, "elapsed_time_per_iteration": 6.62677407, "memory(GiB)": 21.51, "elapsed_time": "10h 22m 9s", "remaining_time": "1h 14m 34s", "loss_scale": 1.0, "consumed_samples": 1454336, "global_step/max_steps": "5681/6362"} +{"lm loss": 4.86685228, "grad_norm": 0.26871225, "learning_rate": 6e-06, "elapsed_time_per_iteration": 6.69412708, "memory(GiB)": 21.51, "elapsed_time": "10h 22m 16s", "remaining_time": "1h 14m 28s", "loss_scale": 1.0, "consumed_samples": 1454592, "global_step/max_steps": "5682/6362"} +{"lm loss": 4.89640236, "grad_norm": 0.2624121, "learning_rate": 5.99e-06, "elapsed_time_per_iteration": 6.52159071, "memory(GiB)": 21.51, "elapsed_time": "10h 22m 23s", "remaining_time": "1h 14m 21s", "loss_scale": 1.0, "consumed_samples": 1454848, "global_step/max_steps": "5683/6362"} +{"lm loss": 4.84990644, "grad_norm": 0.27956098, "learning_rate": 5.98e-06, "elapsed_time_per_iteration": 6.69815636, "memory(GiB)": 21.51, "elapsed_time": "10h 22m 29s", "remaining_time": "1h 14m 15s", "loss_scale": 1.0, "consumed_samples": 1455104, "global_step/max_steps": "5684/6362"} +{"lm loss": 4.88011551, "grad_norm": 0.28535366, "learning_rate": 5.97e-06, "elapsed_time_per_iteration": 6.51789713, "memory(GiB)": 21.51, "elapsed_time": "10h 22m 36s", "remaining_time": "1h 14m 8s", "loss_scale": 1.0, "consumed_samples": 1455360, "global_step/max_steps": "5685/6362"} +{"lm loss": 4.84755516, "grad_norm": 0.27140969, "learning_rate": 5.96e-06, "elapsed_time_per_iteration": 6.69660115, "memory(GiB)": 21.51, "elapsed_time": "10h 22m 43s", "remaining_time": "1h 14m 2s", "loss_scale": 1.0, "consumed_samples": 1455616, "global_step/max_steps": "5686/6362"} +{"lm loss": 4.85998774, "grad_norm": 0.29304892, "learning_rate": 5.95e-06, "elapsed_time_per_iteration": 6.62304521, "memory(GiB)": 21.51, "elapsed_time": "10h 22m 49s", "remaining_time": "1h 13m 55s", "loss_scale": 1.0, "consumed_samples": 1455872, "global_step/max_steps": "5687/6362"} +{"lm loss": 4.8600831, "grad_norm": 0.27376395, "learning_rate": 5.95e-06, "elapsed_time_per_iteration": 6.53226781, "memory(GiB)": 21.51, "elapsed_time": "10h 22m 56s", "remaining_time": "1h 13m 48s", "loss_scale": 1.0, "consumed_samples": 1456128, "global_step/max_steps": "5688/6362"} +{"lm loss": 4.86174345, "grad_norm": 0.27215549, "learning_rate": 5.94e-06, "elapsed_time_per_iteration": 6.49487901, "memory(GiB)": 21.51, "elapsed_time": "10h 23m 2s", "remaining_time": "1h 13m 42s", "loss_scale": 1.0, "consumed_samples": 1456384, "global_step/max_steps": "5689/6362"} +{"lm loss": 4.88476849, "grad_norm": 0.27479708, "learning_rate": 5.93e-06, "elapsed_time_per_iteration": 6.84660268, "memory(GiB)": 21.51, "elapsed_time": "10h 23m 9s", "remaining_time": "1h 13m 35s", "loss_scale": 1.0, "consumed_samples": 1456640, "global_step/max_steps": "5690/6362"} +{"lm loss": 4.88190699, "grad_norm": 0.27237678, "learning_rate": 5.92e-06, "elapsed_time_per_iteration": 6.67910838, "memory(GiB)": 21.51, "elapsed_time": "10h 23m 16s", "remaining_time": "1h 13m 29s", "loss_scale": 1.0, "consumed_samples": 1456896, "global_step/max_steps": "5691/6362"} +{"lm loss": 4.86453915, "grad_norm": 0.28055283, "learning_rate": 5.91e-06, "elapsed_time_per_iteration": 6.61625385, "memory(GiB)": 21.51, "elapsed_time": "10h 23m 22s", "remaining_time": "1h 13m 22s", "loss_scale": 1.0, "consumed_samples": 1457152, "global_step/max_steps": "5692/6362"} +{"lm loss": 4.85319805, "grad_norm": 0.27967989, "learning_rate": 5.9e-06, "elapsed_time_per_iteration": 6.76738429, "memory(GiB)": 21.51, "elapsed_time": "10h 23m 29s", "remaining_time": "1h 13m 16s", "loss_scale": 1.0, "consumed_samples": 1457408, "global_step/max_steps": "5693/6362"} +{"lm loss": 4.85687637, "grad_norm": 0.29413471, "learning_rate": 5.89e-06, "elapsed_time_per_iteration": 6.5376029, "memory(GiB)": 21.51, "elapsed_time": "10h 23m 36s", "remaining_time": "1h 13m 9s", "loss_scale": 1.0, "consumed_samples": 1457664, "global_step/max_steps": "5694/6362"} +{"lm loss": 4.83740234, "grad_norm": 0.27640855, "learning_rate": 5.89e-06, "elapsed_time_per_iteration": 6.46763492, "memory(GiB)": 21.51, "elapsed_time": "10h 23m 42s", "remaining_time": "1h 13m 2s", "loss_scale": 1.0, "consumed_samples": 1457920, "global_step/max_steps": "5695/6362"} +{"lm loss": 4.88139248, "grad_norm": 0.30113253, "learning_rate": 5.88e-06, "elapsed_time_per_iteration": 6.47309136, "memory(GiB)": 21.51, "elapsed_time": "10h 23m 49s", "remaining_time": "1h 12m 56s", "loss_scale": 1.0, "consumed_samples": 1458176, "global_step/max_steps": "5696/6362"} +{"lm loss": 4.86671734, "grad_norm": 0.26475987, "learning_rate": 5.87e-06, "elapsed_time_per_iteration": 6.44613075, "memory(GiB)": 21.51, "elapsed_time": "10h 23m 55s", "remaining_time": "1h 12m 49s", "loss_scale": 1.0, "consumed_samples": 1458432, "global_step/max_steps": "5697/6362"} +{"lm loss": 4.86899614, "grad_norm": 0.28837609, "learning_rate": 5.86e-06, "elapsed_time_per_iteration": 6.47093034, "memory(GiB)": 21.51, "elapsed_time": "10h 24m 2s", "remaining_time": "1h 12m 43s", "loss_scale": 1.0, "consumed_samples": 1458688, "global_step/max_steps": "5698/6362"} +{"lm loss": 4.86880636, "grad_norm": 0.29115975, "learning_rate": 5.85e-06, "elapsed_time_per_iteration": 6.68398714, "memory(GiB)": 21.51, "elapsed_time": "10h 24m 8s", "remaining_time": "1h 12m 36s", "loss_scale": 1.0, "consumed_samples": 1458944, "global_step/max_steps": "5699/6362"} +{"lm loss": 4.87041855, "grad_norm": 0.27737549, "learning_rate": 5.84e-06, "elapsed_time_per_iteration": 6.42806005, "memory(GiB)": 21.51, "elapsed_time": "10h 24m 15s", "remaining_time": "1h 12m 30s", "loss_scale": 1.0, "consumed_samples": 1459200, "global_step/max_steps": "5700/6362"} +{"lm loss": 4.85705662, "grad_norm": 0.29187083, "learning_rate": 5.83e-06, "elapsed_time_per_iteration": 6.58936429, "memory(GiB)": 21.51, "elapsed_time": "10h 24m 21s", "remaining_time": "1h 12m 23s", "loss_scale": 1.0, "consumed_samples": 1459456, "global_step/max_steps": "5701/6362"} +{"lm loss": 4.88509941, "grad_norm": 0.27274513, "learning_rate": 5.83e-06, "elapsed_time_per_iteration": 6.66395092, "memory(GiB)": 21.51, "elapsed_time": "10h 24m 28s", "remaining_time": "1h 12m 16s", "loss_scale": 1.0, "consumed_samples": 1459712, "global_step/max_steps": "5702/6362"} +{"lm loss": 4.89004707, "grad_norm": 0.27121422, "learning_rate": 5.82e-06, "elapsed_time_per_iteration": 6.3847568, "memory(GiB)": 21.51, "elapsed_time": "10h 24m 34s", "remaining_time": "1h 12m 10s", "loss_scale": 1.0, "consumed_samples": 1459968, "global_step/max_steps": "5703/6362"} +{"lm loss": 4.87141275, "grad_norm": 0.2906743, "learning_rate": 5.81e-06, "elapsed_time_per_iteration": 6.35178256, "memory(GiB)": 21.51, "elapsed_time": "10h 24m 41s", "remaining_time": "1h 12m 3s", "loss_scale": 1.0, "consumed_samples": 1460224, "global_step/max_steps": "5704/6362"} +{"lm loss": 4.85946083, "grad_norm": 0.28901789, "learning_rate": 5.8e-06, "elapsed_time_per_iteration": 6.51791191, "memory(GiB)": 21.51, "elapsed_time": "10h 24m 47s", "remaining_time": "1h 11m 57s", "loss_scale": 1.0, "consumed_samples": 1460480, "global_step/max_steps": "5705/6362"} +{"lm loss": 4.86592913, "grad_norm": 0.26675254, "learning_rate": 5.79e-06, "elapsed_time_per_iteration": 6.76465821, "memory(GiB)": 21.51, "elapsed_time": "10h 24m 54s", "remaining_time": "1h 11m 50s", "loss_scale": 1.0, "consumed_samples": 1460736, "global_step/max_steps": "5706/6362"} +{"lm loss": 4.86651421, "grad_norm": 0.26210067, "learning_rate": 5.78e-06, "elapsed_time_per_iteration": 6.89087939, "memory(GiB)": 21.51, "elapsed_time": "10h 25m 1s", "remaining_time": "1h 11m 44s", "loss_scale": 1.0, "consumed_samples": 1460992, "global_step/max_steps": "5707/6362"} +{"lm loss": 4.88782501, "grad_norm": 0.30205315, "learning_rate": 5.78e-06, "elapsed_time_per_iteration": 6.55280972, "memory(GiB)": 21.51, "elapsed_time": "10h 25m 7s", "remaining_time": "1h 11m 37s", "loss_scale": 1.0, "consumed_samples": 1461248, "global_step/max_steps": "5708/6362"} +{"lm loss": 4.89069319, "grad_norm": 0.28488192, "learning_rate": 5.77e-06, "elapsed_time_per_iteration": 6.64515209, "memory(GiB)": 21.51, "elapsed_time": "10h 25m 14s", "remaining_time": "1h 11m 30s", "loss_scale": 1.0, "consumed_samples": 1461504, "global_step/max_steps": "5709/6362"} +{"lm loss": 4.87001753, "grad_norm": 0.28799355, "learning_rate": 5.76e-06, "elapsed_time_per_iteration": 6.44399786, "memory(GiB)": 21.51, "elapsed_time": "10h 25m 20s", "remaining_time": "1h 11m 24s", "loss_scale": 1.0, "consumed_samples": 1461760, "global_step/max_steps": "5710/6362"} +{"lm loss": 4.89930487, "grad_norm": 0.28377607, "learning_rate": 5.75e-06, "elapsed_time_per_iteration": 6.77839494, "memory(GiB)": 21.51, "elapsed_time": "10h 25m 27s", "remaining_time": "1h 11m 17s", "loss_scale": 1.0, "consumed_samples": 1462016, "global_step/max_steps": "5711/6362"} +{"lm loss": 4.85144711, "grad_norm": 0.2898542, "learning_rate": 5.74e-06, "elapsed_time_per_iteration": 6.57581615, "memory(GiB)": 21.51, "elapsed_time": "10h 25m 34s", "remaining_time": "1h 11m 11s", "loss_scale": 1.0, "consumed_samples": 1462272, "global_step/max_steps": "5712/6362"} +{"lm loss": 4.88803339, "grad_norm": 0.29481828, "learning_rate": 5.73e-06, "elapsed_time_per_iteration": 6.65125728, "memory(GiB)": 21.51, "elapsed_time": "10h 25m 40s", "remaining_time": "1h 11m 4s", "loss_scale": 1.0, "consumed_samples": 1462528, "global_step/max_steps": "5713/6362"} +{"lm loss": 4.85826683, "grad_norm": 0.27571115, "learning_rate": 5.73e-06, "elapsed_time_per_iteration": 6.36144686, "memory(GiB)": 21.51, "elapsed_time": "10h 25m 47s", "remaining_time": "1h 10m 58s", "loss_scale": 1.0, "consumed_samples": 1462784, "global_step/max_steps": "5714/6362"} +{"lm loss": 4.87100267, "grad_norm": 0.27828327, "learning_rate": 5.72e-06, "elapsed_time_per_iteration": 6.51334715, "memory(GiB)": 21.51, "elapsed_time": "10h 25m 53s", "remaining_time": "1h 10m 51s", "loss_scale": 1.0, "consumed_samples": 1463040, "global_step/max_steps": "5715/6362"} +{"lm loss": 4.8611927, "grad_norm": 0.27991173, "learning_rate": 5.71e-06, "elapsed_time_per_iteration": 6.77429318, "memory(GiB)": 21.51, "elapsed_time": "10h 26m 0s", "remaining_time": "1h 10m 44s", "loss_scale": 1.0, "consumed_samples": 1463296, "global_step/max_steps": "5716/6362"} +{"lm loss": 4.87947321, "grad_norm": 0.27427319, "learning_rate": 5.7e-06, "elapsed_time_per_iteration": 6.63840485, "memory(GiB)": 21.51, "elapsed_time": "10h 26m 7s", "remaining_time": "1h 10m 38s", "loss_scale": 1.0, "consumed_samples": 1463552, "global_step/max_steps": "5717/6362"} +{"lm loss": 4.85054016, "grad_norm": 0.26939917, "learning_rate": 5.69e-06, "elapsed_time_per_iteration": 6.53656745, "memory(GiB)": 21.51, "elapsed_time": "10h 26m 13s", "remaining_time": "1h 10m 31s", "loss_scale": 1.0, "consumed_samples": 1463808, "global_step/max_steps": "5718/6362"} +{"lm loss": 4.88204908, "grad_norm": 0.30014381, "learning_rate": 5.68e-06, "elapsed_time_per_iteration": 6.55365038, "memory(GiB)": 21.51, "elapsed_time": "10h 26m 20s", "remaining_time": "1h 10m 25s", "loss_scale": 1.0, "consumed_samples": 1464064, "global_step/max_steps": "5719/6362"} +{"lm loss": 4.84231663, "grad_norm": 0.28171879, "learning_rate": 5.68e-06, "elapsed_time_per_iteration": 6.43923664, "memory(GiB)": 21.51, "elapsed_time": "10h 26m 26s", "remaining_time": "1h 10m 18s", "loss_scale": 1.0, "consumed_samples": 1464320, "global_step/max_steps": "5720/6362"} +{"lm loss": 4.89140463, "grad_norm": 0.26015028, "learning_rate": 5.67e-06, "elapsed_time_per_iteration": 6.75332713, "memory(GiB)": 21.51, "elapsed_time": "10h 26m 33s", "remaining_time": "1h 10m 12s", "loss_scale": 1.0, "consumed_samples": 1464576, "global_step/max_steps": "5721/6362"} +{"lm loss": 4.86611271, "grad_norm": 0.26361632, "learning_rate": 5.66e-06, "elapsed_time_per_iteration": 6.52550077, "memory(GiB)": 21.51, "elapsed_time": "10h 26m 40s", "remaining_time": "1h 10m 5s", "loss_scale": 1.0, "consumed_samples": 1464832, "global_step/max_steps": "5722/6362"} +{"lm loss": 4.86587, "grad_norm": 0.28614512, "learning_rate": 5.65e-06, "elapsed_time_per_iteration": 6.56512356, "memory(GiB)": 21.51, "elapsed_time": "10h 26m 46s", "remaining_time": "1h 9m 58s", "loss_scale": 1.0, "consumed_samples": 1465088, "global_step/max_steps": "5723/6362"} +{"lm loss": 4.87869453, "grad_norm": 0.27033782, "learning_rate": 5.64e-06, "elapsed_time_per_iteration": 6.88342953, "memory(GiB)": 21.51, "elapsed_time": "10h 26m 53s", "remaining_time": "1h 9m 52s", "loss_scale": 1.0, "consumed_samples": 1465344, "global_step/max_steps": "5724/6362"} +{"lm loss": 4.86344481, "grad_norm": 0.27970228, "learning_rate": 5.63e-06, "elapsed_time_per_iteration": 6.58987641, "memory(GiB)": 21.51, "elapsed_time": "10h 27m 0s", "remaining_time": "1h 9m 45s", "loss_scale": 1.0, "consumed_samples": 1465600, "global_step/max_steps": "5725/6362"} +{"lm loss": 4.87471676, "grad_norm": 0.29304767, "learning_rate": 5.63e-06, "elapsed_time_per_iteration": 6.83780789, "memory(GiB)": 21.51, "elapsed_time": "10h 27m 6s", "remaining_time": "1h 9m 39s", "loss_scale": 1.0, "consumed_samples": 1465856, "global_step/max_steps": "5726/6362"} +{"lm loss": 4.86508703, "grad_norm": 0.27574086, "learning_rate": 5.62e-06, "elapsed_time_per_iteration": 6.48277497, "memory(GiB)": 21.51, "elapsed_time": "10h 27m 13s", "remaining_time": "1h 9m 32s", "loss_scale": 1.0, "consumed_samples": 1466112, "global_step/max_steps": "5727/6362"} +{"lm loss": 4.86833286, "grad_norm": 0.27997386, "learning_rate": 5.61e-06, "elapsed_time_per_iteration": 6.73535275, "memory(GiB)": 21.51, "elapsed_time": "10h 27m 20s", "remaining_time": "1h 9m 26s", "loss_scale": 1.0, "consumed_samples": 1466368, "global_step/max_steps": "5728/6362"} +{"lm loss": 4.86109686, "grad_norm": 0.28637758, "learning_rate": 5.6e-06, "elapsed_time_per_iteration": 6.50466251, "memory(GiB)": 21.51, "elapsed_time": "10h 27m 26s", "remaining_time": "1h 9m 19s", "loss_scale": 1.0, "consumed_samples": 1466624, "global_step/max_steps": "5729/6362"} +{"lm loss": 4.85342169, "grad_norm": 0.26069754, "learning_rate": 5.59e-06, "elapsed_time_per_iteration": 6.65188575, "memory(GiB)": 21.51, "elapsed_time": "10h 27m 33s", "remaining_time": "1h 9m 13s", "loss_scale": 1.0, "consumed_samples": 1466880, "global_step/max_steps": "5730/6362"} +{"lm loss": 4.8763752, "grad_norm": 0.26641011, "learning_rate": 5.59e-06, "elapsed_time_per_iteration": 6.34901261, "memory(GiB)": 21.51, "elapsed_time": "10h 27m 39s", "remaining_time": "1h 9m 6s", "loss_scale": 1.0, "consumed_samples": 1467136, "global_step/max_steps": "5731/6362"} +{"lm loss": 4.86372471, "grad_norm": 0.28090194, "learning_rate": 5.58e-06, "elapsed_time_per_iteration": 6.59489131, "memory(GiB)": 21.51, "elapsed_time": "10h 27m 46s", "remaining_time": "1h 8m 59s", "loss_scale": 1.0, "consumed_samples": 1467392, "global_step/max_steps": "5732/6362"} +{"lm loss": 4.86789083, "grad_norm": 0.27641094, "learning_rate": 5.57e-06, "elapsed_time_per_iteration": 6.81012607, "memory(GiB)": 21.51, "elapsed_time": "10h 27m 53s", "remaining_time": "1h 8m 53s", "loss_scale": 1.0, "consumed_samples": 1467648, "global_step/max_steps": "5733/6362"} +{"lm loss": 4.8808794, "grad_norm": 0.27198276, "learning_rate": 5.56e-06, "elapsed_time_per_iteration": 6.73864532, "memory(GiB)": 21.51, "elapsed_time": "10h 27m 59s", "remaining_time": "1h 8m 46s", "loss_scale": 1.0, "consumed_samples": 1467904, "global_step/max_steps": "5734/6362"} +{"lm loss": 4.87884092, "grad_norm": 0.27547157, "learning_rate": 5.55e-06, "elapsed_time_per_iteration": 6.68442464, "memory(GiB)": 21.51, "elapsed_time": "10h 28m 6s", "remaining_time": "1h 8m 40s", "loss_scale": 1.0, "consumed_samples": 1468160, "global_step/max_steps": "5735/6362"} +{"lm loss": 4.84791946, "grad_norm": 0.27826533, "learning_rate": 5.55e-06, "elapsed_time_per_iteration": 6.64147353, "memory(GiB)": 21.51, "elapsed_time": "10h 28m 13s", "remaining_time": "1h 8m 33s", "loss_scale": 1.0, "consumed_samples": 1468416, "global_step/max_steps": "5736/6362"} +{"lm loss": 4.88658857, "grad_norm": 0.28274962, "learning_rate": 5.54e-06, "elapsed_time_per_iteration": 6.44406915, "memory(GiB)": 21.51, "elapsed_time": "10h 28m 19s", "remaining_time": "1h 8m 27s", "loss_scale": 1.0, "consumed_samples": 1468672, "global_step/max_steps": "5737/6362"} +{"lm loss": 4.87269211, "grad_norm": 0.27946496, "learning_rate": 5.53e-06, "elapsed_time_per_iteration": 6.41258574, "memory(GiB)": 21.51, "elapsed_time": "10h 28m 26s", "remaining_time": "1h 8m 20s", "loss_scale": 1.0, "consumed_samples": 1468928, "global_step/max_steps": "5738/6362"} +{"lm loss": 4.8323431, "grad_norm": 0.27004951, "learning_rate": 5.52e-06, "elapsed_time_per_iteration": 6.66690779, "memory(GiB)": 21.51, "elapsed_time": "10h 28m 32s", "remaining_time": "1h 8m 13s", "loss_scale": 1.0, "consumed_samples": 1469184, "global_step/max_steps": "5739/6362"} +{"lm loss": 4.86065865, "grad_norm": 0.2655187, "learning_rate": 5.51e-06, "elapsed_time_per_iteration": 6.66512752, "memory(GiB)": 21.51, "elapsed_time": "10h 28m 39s", "remaining_time": "1h 8m 7s", "loss_scale": 1.0, "consumed_samples": 1469440, "global_step/max_steps": "5740/6362"} +{"lm loss": 4.87955284, "grad_norm": 0.28030857, "learning_rate": 5.5e-06, "elapsed_time_per_iteration": 6.57119393, "memory(GiB)": 21.51, "elapsed_time": "10h 28m 45s", "remaining_time": "1h 8m 0s", "loss_scale": 1.0, "consumed_samples": 1469696, "global_step/max_steps": "5741/6362"} +{"lm loss": 4.85492611, "grad_norm": 0.28189656, "learning_rate": 5.5e-06, "elapsed_time_per_iteration": 6.48931527, "memory(GiB)": 21.51, "elapsed_time": "10h 28m 52s", "remaining_time": "1h 7m 54s", "loss_scale": 1.0, "consumed_samples": 1469952, "global_step/max_steps": "5742/6362"} +{"lm loss": 4.8954711, "grad_norm": 0.27334148, "learning_rate": 5.49e-06, "elapsed_time_per_iteration": 6.41248822, "memory(GiB)": 21.51, "elapsed_time": "10h 28m 58s", "remaining_time": "1h 7m 47s", "loss_scale": 1.0, "consumed_samples": 1470208, "global_step/max_steps": "5743/6362"} +{"lm loss": 4.86082411, "grad_norm": 0.2970005, "learning_rate": 5.48e-06, "elapsed_time_per_iteration": 6.69808602, "memory(GiB)": 21.51, "elapsed_time": "10h 29m 5s", "remaining_time": "1h 7m 41s", "loss_scale": 1.0, "consumed_samples": 1470464, "global_step/max_steps": "5744/6362"} +{"lm loss": 4.8873086, "grad_norm": 0.28478098, "learning_rate": 5.47e-06, "elapsed_time_per_iteration": 6.53783798, "memory(GiB)": 21.51, "elapsed_time": "10h 29m 12s", "remaining_time": "1h 7m 34s", "loss_scale": 1.0, "consumed_samples": 1470720, "global_step/max_steps": "5745/6362"} +{"lm loss": 4.88952112, "grad_norm": 0.28013802, "learning_rate": 5.47e-06, "elapsed_time_per_iteration": 6.60531187, "memory(GiB)": 21.51, "elapsed_time": "10h 29m 18s", "remaining_time": "1h 7m 27s", "loss_scale": 1.0, "consumed_samples": 1470976, "global_step/max_steps": "5746/6362"} +{"lm loss": 4.84749699, "grad_norm": 0.27536586, "learning_rate": 5.46e-06, "elapsed_time_per_iteration": 6.49451351, "memory(GiB)": 21.51, "elapsed_time": "10h 29m 25s", "remaining_time": "1h 7m 21s", "loss_scale": 1.0, "consumed_samples": 1471232, "global_step/max_steps": "5747/6362"} +{"lm loss": 4.88168859, "grad_norm": 0.29003364, "learning_rate": 5.45e-06, "elapsed_time_per_iteration": 6.6681273, "memory(GiB)": 21.51, "elapsed_time": "10h 29m 31s", "remaining_time": "1h 7m 14s", "loss_scale": 1.0, "consumed_samples": 1471488, "global_step/max_steps": "5748/6362"} +{"lm loss": 4.87731361, "grad_norm": 0.28821933, "learning_rate": 5.44e-06, "elapsed_time_per_iteration": 6.64072227, "memory(GiB)": 21.51, "elapsed_time": "10h 29m 38s", "remaining_time": "1h 7m 8s", "loss_scale": 1.0, "consumed_samples": 1471744, "global_step/max_steps": "5749/6362"} +{"lm loss": 4.86941862, "grad_norm": 0.27953544, "learning_rate": 5.43e-06, "elapsed_time_per_iteration": 6.63166142, "memory(GiB)": 21.51, "elapsed_time": "10h 29m 45s", "remaining_time": "1h 7m 1s", "loss_scale": 1.0, "consumed_samples": 1472000, "global_step/max_steps": "5750/6362"} +{"lm loss": 4.86696005, "grad_norm": 0.28618559, "learning_rate": 5.43e-06, "elapsed_time_per_iteration": 6.54638577, "memory(GiB)": 21.51, "elapsed_time": "10h 29m 51s", "remaining_time": "1h 6m 55s", "loss_scale": 1.0, "consumed_samples": 1472256, "global_step/max_steps": "5751/6362"} +{"lm loss": 4.8619585, "grad_norm": 0.27502587, "learning_rate": 5.42e-06, "elapsed_time_per_iteration": 6.5089767, "memory(GiB)": 21.51, "elapsed_time": "10h 29m 58s", "remaining_time": "1h 6m 48s", "loss_scale": 1.0, "consumed_samples": 1472512, "global_step/max_steps": "5752/6362"} +{"lm loss": 4.87379265, "grad_norm": 0.27947494, "learning_rate": 5.41e-06, "elapsed_time_per_iteration": 6.61534023, "memory(GiB)": 21.51, "elapsed_time": "10h 30m 4s", "remaining_time": "1h 6m 41s", "loss_scale": 1.0, "consumed_samples": 1472768, "global_step/max_steps": "5753/6362"} +{"lm loss": 4.86139202, "grad_norm": 0.27985623, "learning_rate": 5.4e-06, "elapsed_time_per_iteration": 6.58037853, "memory(GiB)": 21.51, "elapsed_time": "10h 30m 11s", "remaining_time": "1h 6m 35s", "loss_scale": 1.0, "consumed_samples": 1473024, "global_step/max_steps": "5754/6362"} +{"lm loss": 4.86046457, "grad_norm": 0.28919736, "learning_rate": 5.39e-06, "elapsed_time_per_iteration": 6.57088327, "memory(GiB)": 21.51, "elapsed_time": "10h 30m 17s", "remaining_time": "1h 6m 28s", "loss_scale": 1.0, "consumed_samples": 1473280, "global_step/max_steps": "5755/6362"} +{"lm loss": 4.87227392, "grad_norm": 0.2870774, "learning_rate": 5.39e-06, "elapsed_time_per_iteration": 6.56188059, "memory(GiB)": 21.51, "elapsed_time": "10h 30m 24s", "remaining_time": "1h 6m 22s", "loss_scale": 1.0, "consumed_samples": 1473536, "global_step/max_steps": "5756/6362"} +{"lm loss": 4.85993433, "grad_norm": 0.2699528, "learning_rate": 5.38e-06, "elapsed_time_per_iteration": 6.51229811, "memory(GiB)": 21.51, "elapsed_time": "10h 30m 30s", "remaining_time": "1h 6m 15s", "loss_scale": 1.0, "consumed_samples": 1473792, "global_step/max_steps": "5757/6362"} +{"lm loss": 4.848773, "grad_norm": 0.27809045, "learning_rate": 5.37e-06, "elapsed_time_per_iteration": 6.42324543, "memory(GiB)": 21.51, "elapsed_time": "10h 30m 37s", "remaining_time": "1h 6m 9s", "loss_scale": 1.0, "consumed_samples": 1474048, "global_step/max_steps": "5758/6362"} +{"lm loss": 4.87251139, "grad_norm": 0.28883952, "learning_rate": 5.36e-06, "elapsed_time_per_iteration": 6.61480474, "memory(GiB)": 21.51, "elapsed_time": "10h 30m 44s", "remaining_time": "1h 6m 2s", "loss_scale": 1.0, "consumed_samples": 1474304, "global_step/max_steps": "5759/6362"} +{"lm loss": 4.86284256, "grad_norm": 0.26161548, "learning_rate": 5.36e-06, "elapsed_time_per_iteration": 6.54873061, "memory(GiB)": 21.51, "elapsed_time": "10h 30m 50s", "remaining_time": "1h 5m 55s", "loss_scale": 1.0, "consumed_samples": 1474560, "global_step/max_steps": "5760/6362"} +{"lm loss": 4.87621355, "grad_norm": 0.2735709, "learning_rate": 5.35e-06, "elapsed_time_per_iteration": 6.40075469, "memory(GiB)": 21.51, "elapsed_time": "10h 30m 56s", "remaining_time": "1h 5m 49s", "loss_scale": 1.0, "consumed_samples": 1474816, "global_step/max_steps": "5761/6362"} +{"lm loss": 4.86832047, "grad_norm": 0.27095461, "learning_rate": 5.34e-06, "elapsed_time_per_iteration": 6.72426558, "memory(GiB)": 21.51, "elapsed_time": "10h 31m 3s", "remaining_time": "1h 5m 42s", "loss_scale": 1.0, "consumed_samples": 1475072, "global_step/max_steps": "5762/6362"} +{"lm loss": 4.87940598, "grad_norm": 0.27172023, "learning_rate": 5.33e-06, "elapsed_time_per_iteration": 6.4143405, "memory(GiB)": 21.51, "elapsed_time": "10h 31m 10s", "remaining_time": "1h 5m 36s", "loss_scale": 1.0, "consumed_samples": 1475328, "global_step/max_steps": "5763/6362"} +{"lm loss": 4.86599302, "grad_norm": 0.2782118, "learning_rate": 5.32e-06, "elapsed_time_per_iteration": 6.42951894, "memory(GiB)": 21.51, "elapsed_time": "10h 31m 16s", "remaining_time": "1h 5m 29s", "loss_scale": 1.0, "consumed_samples": 1475584, "global_step/max_steps": "5764/6362"} +{"lm loss": 4.84632397, "grad_norm": 0.26343918, "learning_rate": 5.32e-06, "elapsed_time_per_iteration": 6.61108613, "memory(GiB)": 21.51, "elapsed_time": "10h 31m 23s", "remaining_time": "1h 5m 23s", "loss_scale": 1.0, "consumed_samples": 1475840, "global_step/max_steps": "5765/6362"} +{"lm loss": 4.87114286, "grad_norm": 0.27353284, "learning_rate": 5.31e-06, "elapsed_time_per_iteration": 6.39321876, "memory(GiB)": 21.51, "elapsed_time": "10h 31m 29s", "remaining_time": "1h 5m 16s", "loss_scale": 1.0, "consumed_samples": 1476096, "global_step/max_steps": "5766/6362"} +{"lm loss": 4.86598873, "grad_norm": 0.28629968, "learning_rate": 5.3e-06, "elapsed_time_per_iteration": 6.30588007, "memory(GiB)": 21.51, "elapsed_time": "10h 31m 35s", "remaining_time": "1h 5m 9s", "loss_scale": 1.0, "consumed_samples": 1476352, "global_step/max_steps": "5767/6362"} +{"lm loss": 4.8602891, "grad_norm": 0.27137536, "learning_rate": 5.29e-06, "elapsed_time_per_iteration": 6.28016472, "memory(GiB)": 21.51, "elapsed_time": "10h 31m 42s", "remaining_time": "1h 5m 3s", "loss_scale": 1.0, "consumed_samples": 1476608, "global_step/max_steps": "5768/6362"} +{"lm loss": 4.88688517, "grad_norm": 0.26163173, "learning_rate": 5.29e-06, "elapsed_time_per_iteration": 6.44257593, "memory(GiB)": 21.51, "elapsed_time": "10h 31m 48s", "remaining_time": "1h 4m 56s", "loss_scale": 1.0, "consumed_samples": 1476864, "global_step/max_steps": "5769/6362"} +{"lm loss": 4.88030005, "grad_norm": 0.28291002, "learning_rate": 5.28e-06, "elapsed_time_per_iteration": 6.5115304, "memory(GiB)": 21.51, "elapsed_time": "10h 31m 55s", "remaining_time": "1h 4m 50s", "loss_scale": 1.0, "consumed_samples": 1477120, "global_step/max_steps": "5770/6362"} +{"lm loss": 4.86742258, "grad_norm": 0.27332246, "learning_rate": 5.27e-06, "elapsed_time_per_iteration": 6.55150056, "memory(GiB)": 21.51, "elapsed_time": "10h 32m 1s", "remaining_time": "1h 4m 43s", "loss_scale": 1.0, "consumed_samples": 1477376, "global_step/max_steps": "5771/6362"} +{"lm loss": 4.88657427, "grad_norm": 0.27639827, "learning_rate": 5.26e-06, "elapsed_time_per_iteration": 6.52980876, "memory(GiB)": 21.51, "elapsed_time": "10h 32m 8s", "remaining_time": "1h 4m 36s", "loss_scale": 1.0, "consumed_samples": 1477632, "global_step/max_steps": "5772/6362"} +{"lm loss": 4.86159515, "grad_norm": 0.27359298, "learning_rate": 5.26e-06, "elapsed_time_per_iteration": 6.36823344, "memory(GiB)": 21.51, "elapsed_time": "10h 32m 14s", "remaining_time": "1h 4m 30s", "loss_scale": 1.0, "consumed_samples": 1477888, "global_step/max_steps": "5773/6362"} +{"lm loss": 4.88963509, "grad_norm": 0.26719546, "learning_rate": 5.25e-06, "elapsed_time_per_iteration": 6.53460193, "memory(GiB)": 21.51, "elapsed_time": "10h 32m 21s", "remaining_time": "1h 4m 23s", "loss_scale": 1.0, "consumed_samples": 1478144, "global_step/max_steps": "5774/6362"} +{"lm loss": 4.89387369, "grad_norm": 0.26926818, "learning_rate": 5.24e-06, "elapsed_time_per_iteration": 6.50610495, "memory(GiB)": 21.51, "elapsed_time": "10h 32m 27s", "remaining_time": "1h 4m 17s", "loss_scale": 1.0, "consumed_samples": 1478400, "global_step/max_steps": "5775/6362"} +{"lm loss": 4.87135935, "grad_norm": 0.27298138, "learning_rate": 5.23e-06, "elapsed_time_per_iteration": 6.42568707, "memory(GiB)": 21.51, "elapsed_time": "10h 32m 34s", "remaining_time": "1h 4m 10s", "loss_scale": 1.0, "consumed_samples": 1478656, "global_step/max_steps": "5776/6362"} +{"lm loss": 4.87544632, "grad_norm": 0.27933443, "learning_rate": 5.23e-06, "elapsed_time_per_iteration": 6.56000328, "memory(GiB)": 21.51, "elapsed_time": "10h 32m 40s", "remaining_time": "1h 4m 4s", "loss_scale": 1.0, "consumed_samples": 1478912, "global_step/max_steps": "5777/6362"} +{"lm loss": 4.85971165, "grad_norm": 0.27267009, "learning_rate": 5.22e-06, "elapsed_time_per_iteration": 6.93508196, "memory(GiB)": 21.51, "elapsed_time": "10h 32m 47s", "remaining_time": "1h 3m 57s", "loss_scale": 1.0, "consumed_samples": 1479168, "global_step/max_steps": "5778/6362"} +{"lm loss": 4.88429689, "grad_norm": 0.26766774, "learning_rate": 5.21e-06, "elapsed_time_per_iteration": 6.43818831, "memory(GiB)": 21.51, "elapsed_time": "10h 32m 53s", "remaining_time": "1h 3m 50s", "loss_scale": 1.0, "consumed_samples": 1479424, "global_step/max_steps": "5779/6362"} +{"lm loss": 4.87587881, "grad_norm": 0.28240624, "learning_rate": 5.2e-06, "elapsed_time_per_iteration": 6.32316613, "memory(GiB)": 21.51, "elapsed_time": "10h 33m 0s", "remaining_time": "1h 3m 44s", "loss_scale": 1.0, "consumed_samples": 1479680, "global_step/max_steps": "5780/6362"} +{"lm loss": 4.88774633, "grad_norm": 0.28180462, "learning_rate": 5.19e-06, "elapsed_time_per_iteration": 6.61574936, "memory(GiB)": 21.51, "elapsed_time": "10h 33m 6s", "remaining_time": "1h 3m 37s", "loss_scale": 1.0, "consumed_samples": 1479936, "global_step/max_steps": "5781/6362"} +{"lm loss": 4.87038898, "grad_norm": 0.27376553, "learning_rate": 5.19e-06, "elapsed_time_per_iteration": 6.33906412, "memory(GiB)": 21.51, "elapsed_time": "10h 33m 13s", "remaining_time": "1h 3m 31s", "loss_scale": 1.0, "consumed_samples": 1480192, "global_step/max_steps": "5782/6362"} +{"lm loss": 4.88899708, "grad_norm": 0.27445644, "learning_rate": 5.18e-06, "elapsed_time_per_iteration": 6.19902134, "memory(GiB)": 21.51, "elapsed_time": "10h 33m 19s", "remaining_time": "1h 3m 24s", "loss_scale": 1.0, "consumed_samples": 1480448, "global_step/max_steps": "5783/6362"} +{"lm loss": 4.88924313, "grad_norm": 0.28190976, "learning_rate": 5.17e-06, "elapsed_time_per_iteration": 6.28893542, "memory(GiB)": 21.51, "elapsed_time": "10h 33m 25s", "remaining_time": "1h 3m 17s", "loss_scale": 1.0, "consumed_samples": 1480704, "global_step/max_steps": "5784/6362"} +{"lm loss": 4.87148809, "grad_norm": 0.27351058, "learning_rate": 5.17e-06, "elapsed_time_per_iteration": 6.39371705, "memory(GiB)": 21.51, "elapsed_time": "10h 33m 32s", "remaining_time": "1h 3m 11s", "loss_scale": 1.0, "consumed_samples": 1480960, "global_step/max_steps": "5785/6362"} +{"lm loss": 4.87340307, "grad_norm": 0.26989937, "learning_rate": 5.16e-06, "elapsed_time_per_iteration": 6.61910701, "memory(GiB)": 21.51, "elapsed_time": "10h 33m 38s", "remaining_time": "1h 3m 4s", "loss_scale": 1.0, "consumed_samples": 1481216, "global_step/max_steps": "5786/6362"} +{"lm loss": 4.86301279, "grad_norm": 0.25952145, "learning_rate": 5.15e-06, "elapsed_time_per_iteration": 6.26163626, "memory(GiB)": 21.51, "elapsed_time": "10h 33m 44s", "remaining_time": "1h 2m 58s", "loss_scale": 1.0, "consumed_samples": 1481472, "global_step/max_steps": "5787/6362"} +{"lm loss": 4.85113478, "grad_norm": 0.27552599, "learning_rate": 5.14e-06, "elapsed_time_per_iteration": 6.43553543, "memory(GiB)": 21.51, "elapsed_time": "10h 33m 51s", "remaining_time": "1h 2m 51s", "loss_scale": 1.0, "consumed_samples": 1481728, "global_step/max_steps": "5788/6362"} +{"lm loss": 4.86555815, "grad_norm": 0.28549737, "learning_rate": 5.14e-06, "elapsed_time_per_iteration": 6.5952816, "memory(GiB)": 21.51, "elapsed_time": "10h 33m 58s", "remaining_time": "1h 2m 45s", "loss_scale": 1.0, "consumed_samples": 1481984, "global_step/max_steps": "5789/6362"} +{"lm loss": 4.85647202, "grad_norm": 0.27788225, "learning_rate": 5.13e-06, "elapsed_time_per_iteration": 6.52072835, "memory(GiB)": 21.51, "elapsed_time": "10h 34m 4s", "remaining_time": "1h 2m 38s", "loss_scale": 1.0, "consumed_samples": 1482240, "global_step/max_steps": "5790/6362"} +{"lm loss": 4.87407827, "grad_norm": 0.27175245, "learning_rate": 5.12e-06, "elapsed_time_per_iteration": 6.49396229, "memory(GiB)": 21.51, "elapsed_time": "10h 34m 11s", "remaining_time": "1h 2m 31s", "loss_scale": 1.0, "consumed_samples": 1482496, "global_step/max_steps": "5791/6362"} +{"lm loss": 4.87790489, "grad_norm": 0.26796445, "learning_rate": 5.11e-06, "elapsed_time_per_iteration": 6.52600718, "memory(GiB)": 21.51, "elapsed_time": "10h 34m 17s", "remaining_time": "1h 2m 25s", "loss_scale": 1.0, "consumed_samples": 1482752, "global_step/max_steps": "5792/6362"} +{"lm loss": 4.8593483, "grad_norm": 0.27553847, "learning_rate": 5.11e-06, "elapsed_time_per_iteration": 6.7643609, "memory(GiB)": 21.51, "elapsed_time": "10h 34m 24s", "remaining_time": "1h 2m 18s", "loss_scale": 1.0, "consumed_samples": 1483008, "global_step/max_steps": "5793/6362"} +{"lm loss": 4.87817144, "grad_norm": 0.2613019, "learning_rate": 5.1e-06, "elapsed_time_per_iteration": 6.53587389, "memory(GiB)": 21.51, "elapsed_time": "10h 34m 30s", "remaining_time": "1h 2m 12s", "loss_scale": 1.0, "consumed_samples": 1483264, "global_step/max_steps": "5794/6362"} +{"lm loss": 4.86568689, "grad_norm": 0.28200534, "learning_rate": 5.09e-06, "elapsed_time_per_iteration": 6.49142456, "memory(GiB)": 21.51, "elapsed_time": "10h 34m 37s", "remaining_time": "1h 2m 5s", "loss_scale": 1.0, "consumed_samples": 1483520, "global_step/max_steps": "5795/6362"} +{"lm loss": 4.89155674, "grad_norm": 0.29661831, "learning_rate": 5.08e-06, "elapsed_time_per_iteration": 6.55584693, "memory(GiB)": 21.51, "elapsed_time": "10h 34m 43s", "remaining_time": "1h 1m 59s", "loss_scale": 1.0, "consumed_samples": 1483776, "global_step/max_steps": "5796/6362"} +{"lm loss": 4.87137032, "grad_norm": 0.26522237, "learning_rate": 5.08e-06, "elapsed_time_per_iteration": 6.86891603, "memory(GiB)": 21.51, "elapsed_time": "10h 34m 50s", "remaining_time": "1h 1m 52s", "loss_scale": 1.0, "consumed_samples": 1484032, "global_step/max_steps": "5797/6362"} +{"lm loss": 4.8334074, "grad_norm": 0.25719467, "learning_rate": 5.07e-06, "elapsed_time_per_iteration": 6.46802497, "memory(GiB)": 21.51, "elapsed_time": "10h 34m 57s", "remaining_time": "1h 1m 45s", "loss_scale": 1.0, "consumed_samples": 1484288, "global_step/max_steps": "5798/6362"} +{"lm loss": 4.88413763, "grad_norm": 0.28065488, "learning_rate": 5.06e-06, "elapsed_time_per_iteration": 6.77596664, "memory(GiB)": 21.51, "elapsed_time": "10h 35m 4s", "remaining_time": "1h 1m 39s", "loss_scale": 1.0, "consumed_samples": 1484544, "global_step/max_steps": "5799/6362"} +{"lm loss": 4.87948227, "grad_norm": 0.27341241, "learning_rate": 5.05e-06, "elapsed_time_per_iteration": 6.6790545, "memory(GiB)": 21.51, "elapsed_time": "10h 35m 10s", "remaining_time": "1h 1m 32s", "loss_scale": 1.0, "consumed_samples": 1484800, "global_step/max_steps": "5800/6362"} +{"lm loss": 4.85427999, "grad_norm": 0.27332854, "learning_rate": 5.05e-06, "elapsed_time_per_iteration": 6.54677725, "memory(GiB)": 21.51, "elapsed_time": "10h 35m 17s", "remaining_time": "1h 1m 26s", "loss_scale": 1.0, "consumed_samples": 1485056, "global_step/max_steps": "5801/6362"} +{"lm loss": 4.86234951, "grad_norm": 0.26470044, "learning_rate": 5.04e-06, "elapsed_time_per_iteration": 6.99554753, "memory(GiB)": 21.51, "elapsed_time": "10h 35m 24s", "remaining_time": "1h 1m 19s", "loss_scale": 1.0, "consumed_samples": 1485312, "global_step/max_steps": "5802/6362"} +{"lm loss": 4.86243105, "grad_norm": 0.26875749, "learning_rate": 5.03e-06, "elapsed_time_per_iteration": 6.46608996, "memory(GiB)": 21.51, "elapsed_time": "10h 35m 30s", "remaining_time": "1h 1m 13s", "loss_scale": 1.0, "consumed_samples": 1485568, "global_step/max_steps": "5803/6362"} +{"lm loss": 4.86274624, "grad_norm": 0.28273973, "learning_rate": 5.03e-06, "elapsed_time_per_iteration": 6.77856398, "memory(GiB)": 21.51, "elapsed_time": "10h 35m 37s", "remaining_time": "1h 1m 6s", "loss_scale": 1.0, "consumed_samples": 1485824, "global_step/max_steps": "5804/6362"} +{"lm loss": 4.88871336, "grad_norm": 0.27369916, "learning_rate": 5.02e-06, "elapsed_time_per_iteration": 6.53083563, "memory(GiB)": 21.51, "elapsed_time": "10h 35m 44s", "remaining_time": "1h 0m 59s", "loss_scale": 1.0, "consumed_samples": 1486080, "global_step/max_steps": "5805/6362"} +{"lm loss": 4.85268259, "grad_norm": 0.26100898, "learning_rate": 5.01e-06, "elapsed_time_per_iteration": 6.38021636, "memory(GiB)": 21.51, "elapsed_time": "10h 35m 50s", "remaining_time": "1h 0m 53s", "loss_scale": 1.0, "consumed_samples": 1486336, "global_step/max_steps": "5806/6362"} +{"lm loss": 4.84234905, "grad_norm": 0.26227319, "learning_rate": 5e-06, "elapsed_time_per_iteration": 6.56263733, "memory(GiB)": 21.51, "elapsed_time": "10h 35m 56s", "remaining_time": "1h 0m 46s", "loss_scale": 1.0, "consumed_samples": 1486592, "global_step/max_steps": "5807/6362"} +{"lm loss": 4.89485836, "grad_norm": 0.29242009, "learning_rate": 5e-06, "elapsed_time_per_iteration": 6.71148753, "memory(GiB)": 21.51, "elapsed_time": "10h 36m 3s", "remaining_time": "1h 0m 40s", "loss_scale": 1.0, "consumed_samples": 1486848, "global_step/max_steps": "5808/6362"} +{"lm loss": 4.83684635, "grad_norm": 0.26776761, "learning_rate": 4.99e-06, "elapsed_time_per_iteration": 6.73887658, "memory(GiB)": 21.51, "elapsed_time": "10h 36m 10s", "remaining_time": "1h 0m 33s", "loss_scale": 1.0, "consumed_samples": 1487104, "global_step/max_steps": "5809/6362"} +{"lm loss": 4.88569832, "grad_norm": 0.25244832, "learning_rate": 4.98e-06, "elapsed_time_per_iteration": 6.58974051, "memory(GiB)": 21.51, "elapsed_time": "10h 36m 16s", "remaining_time": "1h 0m 27s", "loss_scale": 1.0, "consumed_samples": 1487360, "global_step/max_steps": "5810/6362"} +{"lm loss": 4.84903049, "grad_norm": 0.28118628, "learning_rate": 4.98e-06, "elapsed_time_per_iteration": 6.7891016, "memory(GiB)": 21.51, "elapsed_time": "10h 36m 23s", "remaining_time": "1h 0m 20s", "loss_scale": 1.0, "consumed_samples": 1487616, "global_step/max_steps": "5811/6362"} +{"lm loss": 4.85714579, "grad_norm": 0.26288736, "learning_rate": 4.97e-06, "elapsed_time_per_iteration": 6.63370705, "memory(GiB)": 21.51, "elapsed_time": "10h 36m 30s", "remaining_time": "1h 0m 14s", "loss_scale": 1.0, "consumed_samples": 1487872, "global_step/max_steps": "5812/6362"} +{"lm loss": 4.86943483, "grad_norm": 0.2656666, "learning_rate": 4.96e-06, "elapsed_time_per_iteration": 6.75729203, "memory(GiB)": 21.51, "elapsed_time": "10h 36m 37s", "remaining_time": "1h 0m 7s", "loss_scale": 1.0, "consumed_samples": 1488128, "global_step/max_steps": "5813/6362"} +{"lm loss": 4.87323809, "grad_norm": 0.2653178, "learning_rate": 4.95e-06, "elapsed_time_per_iteration": 6.57471991, "memory(GiB)": 21.51, "elapsed_time": "10h 36m 43s", "remaining_time": "1h 0m 0s", "loss_scale": 1.0, "consumed_samples": 1488384, "global_step/max_steps": "5814/6362"} +{"lm loss": 4.87430048, "grad_norm": 0.27013576, "learning_rate": 4.95e-06, "elapsed_time_per_iteration": 6.54689932, "memory(GiB)": 21.51, "elapsed_time": "10h 36m 50s", "remaining_time": "59m 54s", "loss_scale": 1.0, "consumed_samples": 1488640, "global_step/max_steps": "5815/6362"} +{"lm loss": 4.89848375, "grad_norm": 0.26791394, "learning_rate": 4.94e-06, "elapsed_time_per_iteration": 6.59392905, "memory(GiB)": 21.51, "elapsed_time": "10h 36m 56s", "remaining_time": "59m 47s", "loss_scale": 1.0, "consumed_samples": 1488896, "global_step/max_steps": "5816/6362"} +{"lm loss": 4.84461832, "grad_norm": 0.27533475, "learning_rate": 4.93e-06, "elapsed_time_per_iteration": 6.49321246, "memory(GiB)": 21.51, "elapsed_time": "10h 37m 3s", "remaining_time": "59m 41s", "loss_scale": 1.0, "consumed_samples": 1489152, "global_step/max_steps": "5817/6362"} +{"lm loss": 4.83993816, "grad_norm": 0.26342073, "learning_rate": 4.93e-06, "elapsed_time_per_iteration": 6.92800951, "memory(GiB)": 21.51, "elapsed_time": "10h 37m 10s", "remaining_time": "59m 34s", "loss_scale": 1.0, "consumed_samples": 1489408, "global_step/max_steps": "5818/6362"} +{"lm loss": 4.86774158, "grad_norm": 0.26514775, "learning_rate": 4.92e-06, "elapsed_time_per_iteration": 6.52593398, "memory(GiB)": 21.51, "elapsed_time": "10h 37m 16s", "remaining_time": "59m 28s", "loss_scale": 1.0, "consumed_samples": 1489664, "global_step/max_steps": "5819/6362"} +{"lm loss": 4.84310532, "grad_norm": 0.27959514, "learning_rate": 4.91e-06, "elapsed_time_per_iteration": 6.67577386, "memory(GiB)": 21.51, "elapsed_time": "10h 37m 23s", "remaining_time": "59m 21s", "loss_scale": 1.0, "consumed_samples": 1489920, "global_step/max_steps": "5820/6362"} +{"lm loss": 4.86228228, "grad_norm": 0.2732513, "learning_rate": 4.91e-06, "elapsed_time_per_iteration": 6.67591119, "memory(GiB)": 21.51, "elapsed_time": "10h 37m 30s", "remaining_time": "59m 14s", "loss_scale": 1.0, "consumed_samples": 1490176, "global_step/max_steps": "5821/6362"} +{"lm loss": 4.86019516, "grad_norm": 0.28106016, "learning_rate": 4.9e-06, "elapsed_time_per_iteration": 6.61948895, "memory(GiB)": 21.51, "elapsed_time": "10h 37m 36s", "remaining_time": "59m 8s", "loss_scale": 1.0, "consumed_samples": 1490432, "global_step/max_steps": "5822/6362"} +{"lm loss": 4.85381699, "grad_norm": 0.26808465, "learning_rate": 4.89e-06, "elapsed_time_per_iteration": 6.59262252, "memory(GiB)": 21.51, "elapsed_time": "10h 37m 43s", "remaining_time": "59m 1s", "loss_scale": 1.0, "consumed_samples": 1490688, "global_step/max_steps": "5823/6362"} +{"lm loss": 4.87641668, "grad_norm": 0.25266775, "learning_rate": 4.88e-06, "elapsed_time_per_iteration": 6.75326872, "memory(GiB)": 21.51, "elapsed_time": "10h 37m 50s", "remaining_time": "58m 55s", "loss_scale": 1.0, "consumed_samples": 1490944, "global_step/max_steps": "5824/6362"} +{"lm loss": 4.88871908, "grad_norm": 0.29593512, "learning_rate": 4.88e-06, "elapsed_time_per_iteration": 6.63518715, "memory(GiB)": 21.51, "elapsed_time": "10h 37m 56s", "remaining_time": "58m 48s", "loss_scale": 1.0, "consumed_samples": 1491200, "global_step/max_steps": "5825/6362"} +{"lm loss": 4.89675999, "grad_norm": 0.26448569, "learning_rate": 4.87e-06, "elapsed_time_per_iteration": 6.84448814, "memory(GiB)": 21.51, "elapsed_time": "10h 38m 3s", "remaining_time": "58m 42s", "loss_scale": 1.0, "consumed_samples": 1491456, "global_step/max_steps": "5826/6362"} +{"lm loss": 4.85203505, "grad_norm": 0.26680321, "learning_rate": 4.86e-06, "elapsed_time_per_iteration": 6.78874516, "memory(GiB)": 21.51, "elapsed_time": "10h 38m 10s", "remaining_time": "58m 35s", "loss_scale": 1.0, "consumed_samples": 1491712, "global_step/max_steps": "5827/6362"} +{"lm loss": 4.85056305, "grad_norm": 0.27764893, "learning_rate": 4.86e-06, "elapsed_time_per_iteration": 6.60413575, "memory(GiB)": 21.51, "elapsed_time": "10h 38m 17s", "remaining_time": "58m 29s", "loss_scale": 1.0, "consumed_samples": 1491968, "global_step/max_steps": "5828/6362"} +{"lm loss": 4.85090399, "grad_norm": 0.27837032, "learning_rate": 4.85e-06, "elapsed_time_per_iteration": 6.79574704, "memory(GiB)": 21.51, "elapsed_time": "10h 38m 23s", "remaining_time": "58m 22s", "loss_scale": 1.0, "consumed_samples": 1492224, "global_step/max_steps": "5829/6362"} +{"lm loss": 4.8699317, "grad_norm": 0.26310003, "learning_rate": 4.84e-06, "elapsed_time_per_iteration": 6.73787642, "memory(GiB)": 21.51, "elapsed_time": "10h 38m 30s", "remaining_time": "58m 15s", "loss_scale": 1.0, "consumed_samples": 1492480, "global_step/max_steps": "5830/6362"} +{"lm loss": 4.87213612, "grad_norm": 0.27061796, "learning_rate": 4.84e-06, "elapsed_time_per_iteration": 6.81454849, "memory(GiB)": 21.51, "elapsed_time": "10h 38m 37s", "remaining_time": "58m 9s", "loss_scale": 1.0, "consumed_samples": 1492736, "global_step/max_steps": "5831/6362"} +{"lm loss": 4.86767197, "grad_norm": 0.26968858, "learning_rate": 4.83e-06, "elapsed_time_per_iteration": 6.62479377, "memory(GiB)": 21.51, "elapsed_time": "10h 38m 43s", "remaining_time": "58m 2s", "loss_scale": 1.0, "consumed_samples": 1492992, "global_step/max_steps": "5832/6362"} +{"lm loss": 4.86518002, "grad_norm": 0.26352537, "learning_rate": 4.82e-06, "elapsed_time_per_iteration": 6.85201454, "memory(GiB)": 21.51, "elapsed_time": "10h 38m 50s", "remaining_time": "57m 56s", "loss_scale": 1.0, "consumed_samples": 1493248, "global_step/max_steps": "5833/6362"} +{"lm loss": 4.87652969, "grad_norm": 0.26974839, "learning_rate": 4.82e-06, "elapsed_time_per_iteration": 6.61743617, "memory(GiB)": 21.51, "elapsed_time": "10h 38m 57s", "remaining_time": "57m 49s", "loss_scale": 1.0, "consumed_samples": 1493504, "global_step/max_steps": "5834/6362"} +{"lm loss": 4.87420607, "grad_norm": 0.26461986, "learning_rate": 4.81e-06, "elapsed_time_per_iteration": 6.55734825, "memory(GiB)": 21.51, "elapsed_time": "10h 39m 4s", "remaining_time": "57m 43s", "loss_scale": 1.0, "consumed_samples": 1493760, "global_step/max_steps": "5835/6362"} +{"lm loss": 4.87318802, "grad_norm": 0.25922081, "learning_rate": 4.8e-06, "elapsed_time_per_iteration": 6.55420732, "memory(GiB)": 21.51, "elapsed_time": "10h 39m 10s", "remaining_time": "57m 36s", "loss_scale": 1.0, "consumed_samples": 1494016, "global_step/max_steps": "5836/6362"} +{"lm loss": 4.84965038, "grad_norm": 0.25990194, "learning_rate": 4.79e-06, "elapsed_time_per_iteration": 6.52742386, "memory(GiB)": 21.51, "elapsed_time": "10h 39m 17s", "remaining_time": "57m 29s", "loss_scale": 1.0, "consumed_samples": 1494272, "global_step/max_steps": "5837/6362"} +{"lm loss": 4.86841822, "grad_norm": 0.26857895, "learning_rate": 4.79e-06, "elapsed_time_per_iteration": 6.61570382, "memory(GiB)": 21.51, "elapsed_time": "10h 39m 23s", "remaining_time": "57m 23s", "loss_scale": 1.0, "consumed_samples": 1494528, "global_step/max_steps": "5838/6362"} +{"lm loss": 4.87203217, "grad_norm": 0.27585968, "learning_rate": 4.78e-06, "elapsed_time_per_iteration": 6.68635297, "memory(GiB)": 21.51, "elapsed_time": "10h 39m 30s", "remaining_time": "57m 16s", "loss_scale": 1.0, "consumed_samples": 1494784, "global_step/max_steps": "5839/6362"} +{"lm loss": 4.88969374, "grad_norm": 0.26841083, "learning_rate": 4.77e-06, "elapsed_time_per_iteration": 6.66769099, "memory(GiB)": 21.51, "elapsed_time": "10h 39m 37s", "remaining_time": "57m 10s", "loss_scale": 1.0, "consumed_samples": 1495040, "global_step/max_steps": "5840/6362"} +{"lm loss": 4.86306858, "grad_norm": 0.26744315, "learning_rate": 4.77e-06, "elapsed_time_per_iteration": 6.62393689, "memory(GiB)": 21.51, "elapsed_time": "10h 39m 43s", "remaining_time": "57m 3s", "loss_scale": 1.0, "consumed_samples": 1495296, "global_step/max_steps": "5841/6362"} +{"lm loss": 4.84880018, "grad_norm": 0.27155781, "learning_rate": 4.76e-06, "elapsed_time_per_iteration": 6.52698112, "memory(GiB)": 21.51, "elapsed_time": "10h 39m 50s", "remaining_time": "56m 57s", "loss_scale": 1.0, "consumed_samples": 1495552, "global_step/max_steps": "5842/6362"} +{"lm loss": 4.85510349, "grad_norm": 0.26037458, "learning_rate": 4.75e-06, "elapsed_time_per_iteration": 6.52212071, "memory(GiB)": 21.51, "elapsed_time": "10h 39m 56s", "remaining_time": "56m 50s", "loss_scale": 1.0, "consumed_samples": 1495808, "global_step/max_steps": "5843/6362"} +{"lm loss": 4.8485465, "grad_norm": 0.26276779, "learning_rate": 4.75e-06, "elapsed_time_per_iteration": 6.58589005, "memory(GiB)": 21.51, "elapsed_time": "10h 40m 3s", "remaining_time": "56m 43s", "loss_scale": 1.0, "consumed_samples": 1496064, "global_step/max_steps": "5844/6362"} +{"lm loss": 4.89132023, "grad_norm": 0.2601316, "learning_rate": 4.74e-06, "elapsed_time_per_iteration": 6.62602592, "memory(GiB)": 21.51, "elapsed_time": "10h 40m 9s", "remaining_time": "56m 37s", "loss_scale": 1.0, "consumed_samples": 1496320, "global_step/max_steps": "5845/6362"} +{"lm loss": 4.89319134, "grad_norm": 0.25408399, "learning_rate": 4.73e-06, "elapsed_time_per_iteration": 6.35073686, "memory(GiB)": 21.51, "elapsed_time": "10h 40m 16s", "remaining_time": "56m 30s", "loss_scale": 1.0, "consumed_samples": 1496576, "global_step/max_steps": "5846/6362"} +{"lm loss": 4.86166859, "grad_norm": 0.2613261, "learning_rate": 4.73e-06, "elapsed_time_per_iteration": 6.73270297, "memory(GiB)": 21.51, "elapsed_time": "10h 40m 23s", "remaining_time": "56m 24s", "loss_scale": 1.0, "consumed_samples": 1496832, "global_step/max_steps": "5847/6362"} +{"lm loss": 4.87417793, "grad_norm": 0.26058137, "learning_rate": 4.72e-06, "elapsed_time_per_iteration": 6.41573143, "memory(GiB)": 21.51, "elapsed_time": "10h 40m 29s", "remaining_time": "56m 17s", "loss_scale": 1.0, "consumed_samples": 1497088, "global_step/max_steps": "5848/6362"} +{"lm loss": 4.84327745, "grad_norm": 0.27586311, "learning_rate": 4.71e-06, "elapsed_time_per_iteration": 6.34069324, "memory(GiB)": 21.51, "elapsed_time": "10h 40m 35s", "remaining_time": "56m 11s", "loss_scale": 1.0, "consumed_samples": 1497344, "global_step/max_steps": "5849/6362"} +{"lm loss": 4.87309504, "grad_norm": 0.25637236, "learning_rate": 4.71e-06, "elapsed_time_per_iteration": 6.30198002, "memory(GiB)": 21.51, "elapsed_time": "10h 40m 42s", "remaining_time": "56m 4s", "loss_scale": 1.0, "consumed_samples": 1497600, "global_step/max_steps": "5850/6362"} +{"lm loss": 4.86499071, "grad_norm": 0.26240978, "learning_rate": 4.7e-06, "elapsed_time_per_iteration": 6.32349896, "memory(GiB)": 21.51, "elapsed_time": "10h 40m 48s", "remaining_time": "55m 57s", "loss_scale": 1.0, "consumed_samples": 1497856, "global_step/max_steps": "5851/6362"} +{"lm loss": 4.86702585, "grad_norm": 0.26490429, "learning_rate": 4.69e-06, "elapsed_time_per_iteration": 6.77434874, "memory(GiB)": 21.51, "elapsed_time": "10h 40m 55s", "remaining_time": "55m 51s", "loss_scale": 1.0, "consumed_samples": 1498112, "global_step/max_steps": "5852/6362"} +{"lm loss": 4.87960052, "grad_norm": 0.26081792, "learning_rate": 4.69e-06, "elapsed_time_per_iteration": 6.55650449, "memory(GiB)": 21.51, "elapsed_time": "10h 41m 1s", "remaining_time": "55m 44s", "loss_scale": 1.0, "consumed_samples": 1498368, "global_step/max_steps": "5853/6362"} +{"lm loss": 4.85460186, "grad_norm": 0.26517653, "learning_rate": 4.68e-06, "elapsed_time_per_iteration": 6.55294704, "memory(GiB)": 21.51, "elapsed_time": "10h 41m 8s", "remaining_time": "55m 38s", "loss_scale": 1.0, "consumed_samples": 1498624, "global_step/max_steps": "5854/6362"} +{"lm loss": 4.88266611, "grad_norm": 0.26503217, "learning_rate": 4.67e-06, "elapsed_time_per_iteration": 6.61536002, "memory(GiB)": 21.51, "elapsed_time": "10h 41m 14s", "remaining_time": "55m 31s", "loss_scale": 1.0, "consumed_samples": 1498880, "global_step/max_steps": "5855/6362"} +{"lm loss": 4.88084412, "grad_norm": 0.26582363, "learning_rate": 4.67e-06, "elapsed_time_per_iteration": 6.49166441, "memory(GiB)": 21.51, "elapsed_time": "10h 41m 21s", "remaining_time": "55m 25s", "loss_scale": 1.0, "consumed_samples": 1499136, "global_step/max_steps": "5856/6362"} +{"lm loss": 4.86334419, "grad_norm": 0.26516566, "learning_rate": 4.66e-06, "elapsed_time_per_iteration": 6.61152053, "memory(GiB)": 21.51, "elapsed_time": "10h 41m 28s", "remaining_time": "55m 18s", "loss_scale": 1.0, "consumed_samples": 1499392, "global_step/max_steps": "5857/6362"} +{"lm loss": 4.87008238, "grad_norm": 0.27028605, "learning_rate": 4.65e-06, "elapsed_time_per_iteration": 6.50751567, "memory(GiB)": 21.51, "elapsed_time": "10h 41m 34s", "remaining_time": "55m 11s", "loss_scale": 1.0, "consumed_samples": 1499648, "global_step/max_steps": "5858/6362"} +{"lm loss": 4.852036, "grad_norm": 0.28821152, "learning_rate": 4.65e-06, "elapsed_time_per_iteration": 6.68231344, "memory(GiB)": 21.51, "elapsed_time": "10h 41m 41s", "remaining_time": "55m 5s", "loss_scale": 1.0, "consumed_samples": 1499904, "global_step/max_steps": "5859/6362"} +{"lm loss": 4.85897636, "grad_norm": 0.26147547, "learning_rate": 4.64e-06, "elapsed_time_per_iteration": 6.5186789, "memory(GiB)": 21.51, "elapsed_time": "10h 41m 47s", "remaining_time": "54m 58s", "loss_scale": 1.0, "consumed_samples": 1500160, "global_step/max_steps": "5860/6362"} +{"lm loss": 4.87572527, "grad_norm": 0.26949605, "learning_rate": 4.64e-06, "elapsed_time_per_iteration": 6.55794215, "memory(GiB)": 21.51, "elapsed_time": "10h 41m 54s", "remaining_time": "54m 52s", "loss_scale": 1.0, "consumed_samples": 1500416, "global_step/max_steps": "5861/6362"} +{"lm loss": 4.8761425, "grad_norm": 0.27823153, "learning_rate": 4.63e-06, "elapsed_time_per_iteration": 6.71216702, "memory(GiB)": 21.51, "elapsed_time": "10h 42m 1s", "remaining_time": "54m 45s", "loss_scale": 1.0, "consumed_samples": 1500672, "global_step/max_steps": "5862/6362"} +{"lm loss": 4.86766386, "grad_norm": 0.26791537, "learning_rate": 4.62e-06, "elapsed_time_per_iteration": 6.56311512, "memory(GiB)": 21.51, "elapsed_time": "10h 42m 7s", "remaining_time": "54m 39s", "loss_scale": 1.0, "consumed_samples": 1500928, "global_step/max_steps": "5863/6362"} +{"lm loss": 4.87006521, "grad_norm": 0.29583323, "learning_rate": 4.62e-06, "elapsed_time_per_iteration": 6.55716848, "memory(GiB)": 21.51, "elapsed_time": "10h 42m 14s", "remaining_time": "54m 32s", "loss_scale": 1.0, "consumed_samples": 1501184, "global_step/max_steps": "5864/6362"} +{"lm loss": 4.85465908, "grad_norm": 0.26300749, "learning_rate": 4.61e-06, "elapsed_time_per_iteration": 6.55858231, "memory(GiB)": 21.51, "elapsed_time": "10h 42m 20s", "remaining_time": "54m 25s", "loss_scale": 1.0, "consumed_samples": 1501440, "global_step/max_steps": "5865/6362"} +{"lm loss": 4.86778021, "grad_norm": 0.26208586, "learning_rate": 4.6e-06, "elapsed_time_per_iteration": 6.67220449, "memory(GiB)": 21.51, "elapsed_time": "10h 42m 27s", "remaining_time": "54m 19s", "loss_scale": 1.0, "consumed_samples": 1501696, "global_step/max_steps": "5866/6362"} +{"lm loss": 4.87555504, "grad_norm": 0.26572859, "learning_rate": 4.6e-06, "elapsed_time_per_iteration": 6.45689106, "memory(GiB)": 21.51, "elapsed_time": "10h 42m 33s", "remaining_time": "54m 12s", "loss_scale": 1.0, "consumed_samples": 1501952, "global_step/max_steps": "5867/6362"} +{"lm loss": 4.89272118, "grad_norm": 0.25193316, "learning_rate": 4.59e-06, "elapsed_time_per_iteration": 6.50561619, "memory(GiB)": 21.51, "elapsed_time": "10h 42m 40s", "remaining_time": "54m 6s", "loss_scale": 1.0, "consumed_samples": 1502208, "global_step/max_steps": "5868/6362"} +{"lm loss": 4.90465832, "grad_norm": 0.2618739, "learning_rate": 4.58e-06, "elapsed_time_per_iteration": 6.72728014, "memory(GiB)": 21.51, "elapsed_time": "10h 42m 47s", "remaining_time": "53m 59s", "loss_scale": 1.0, "consumed_samples": 1502464, "global_step/max_steps": "5869/6362"} +{"lm loss": 4.8735137, "grad_norm": 0.26766607, "learning_rate": 4.58e-06, "elapsed_time_per_iteration": 6.49642658, "memory(GiB)": 21.51, "elapsed_time": "10h 42m 53s", "remaining_time": "53m 53s", "loss_scale": 1.0, "consumed_samples": 1502720, "global_step/max_steps": "5870/6362"} +{"lm loss": 4.88080931, "grad_norm": 0.27986559, "learning_rate": 4.57e-06, "elapsed_time_per_iteration": 6.68551254, "memory(GiB)": 21.51, "elapsed_time": "10h 43m 0s", "remaining_time": "53m 46s", "loss_scale": 1.0, "consumed_samples": 1502976, "global_step/max_steps": "5871/6362"} +{"lm loss": 4.84822845, "grad_norm": 0.26060566, "learning_rate": 4.56e-06, "elapsed_time_per_iteration": 6.51357937, "memory(GiB)": 21.51, "elapsed_time": "10h 43m 6s", "remaining_time": "53m 39s", "loss_scale": 1.0, "consumed_samples": 1503232, "global_step/max_steps": "5872/6362"} +{"lm loss": 4.87294626, "grad_norm": 0.26894099, "learning_rate": 4.56e-06, "elapsed_time_per_iteration": 6.5872128, "memory(GiB)": 21.51, "elapsed_time": "10h 43m 13s", "remaining_time": "53m 33s", "loss_scale": 1.0, "consumed_samples": 1503488, "global_step/max_steps": "5873/6362"} +{"lm loss": 4.85891438, "grad_norm": 0.26340261, "learning_rate": 4.55e-06, "elapsed_time_per_iteration": 6.7296443, "memory(GiB)": 21.51, "elapsed_time": "10h 43m 20s", "remaining_time": "53m 26s", "loss_scale": 1.0, "consumed_samples": 1503744, "global_step/max_steps": "5874/6362"} +{"lm loss": 4.84955883, "grad_norm": 0.27632308, "learning_rate": 4.55e-06, "elapsed_time_per_iteration": 6.65350008, "memory(GiB)": 21.51, "elapsed_time": "10h 43m 26s", "remaining_time": "53m 20s", "loss_scale": 1.0, "consumed_samples": 1504000, "global_step/max_steps": "5875/6362"} +{"lm loss": 4.87308025, "grad_norm": 0.26171085, "learning_rate": 4.54e-06, "elapsed_time_per_iteration": 6.49567986, "memory(GiB)": 21.51, "elapsed_time": "10h 43m 33s", "remaining_time": "53m 13s", "loss_scale": 1.0, "consumed_samples": 1504256, "global_step/max_steps": "5876/6362"} +{"lm loss": 4.86806488, "grad_norm": 0.2515246, "learning_rate": 4.53e-06, "elapsed_time_per_iteration": 6.46444273, "memory(GiB)": 21.51, "elapsed_time": "10h 43m 39s", "remaining_time": "53m 7s", "loss_scale": 1.0, "consumed_samples": 1504512, "global_step/max_steps": "5877/6362"} +{"lm loss": 4.88530922, "grad_norm": 0.26346648, "learning_rate": 4.53e-06, "elapsed_time_per_iteration": 6.62459636, "memory(GiB)": 21.51, "elapsed_time": "10h 43m 46s", "remaining_time": "53m 0s", "loss_scale": 1.0, "consumed_samples": 1504768, "global_step/max_steps": "5878/6362"} +{"lm loss": 4.86761522, "grad_norm": 0.27227521, "learning_rate": 4.52e-06, "elapsed_time_per_iteration": 6.56626749, "memory(GiB)": 21.51, "elapsed_time": "10h 43m 52s", "remaining_time": "52m 53s", "loss_scale": 1.0, "consumed_samples": 1505024, "global_step/max_steps": "5879/6362"} +{"lm loss": 4.85570431, "grad_norm": 0.27927703, "learning_rate": 4.51e-06, "elapsed_time_per_iteration": 6.48304033, "memory(GiB)": 21.51, "elapsed_time": "10h 43m 59s", "remaining_time": "52m 47s", "loss_scale": 1.0, "consumed_samples": 1505280, "global_step/max_steps": "5880/6362"} +{"lm loss": 4.81856823, "grad_norm": 0.26998472, "learning_rate": 4.51e-06, "elapsed_time_per_iteration": 6.49197555, "memory(GiB)": 21.51, "elapsed_time": "10h 44m 5s", "remaining_time": "52m 40s", "loss_scale": 1.0, "consumed_samples": 1505536, "global_step/max_steps": "5881/6362"} +{"lm loss": 4.87542963, "grad_norm": 0.26120517, "learning_rate": 4.5e-06, "elapsed_time_per_iteration": 6.59852195, "memory(GiB)": 21.51, "elapsed_time": "10h 44m 12s", "remaining_time": "52m 34s", "loss_scale": 1.0, "consumed_samples": 1505792, "global_step/max_steps": "5882/6362"} +{"lm loss": 4.85122395, "grad_norm": 0.28237337, "learning_rate": 4.5e-06, "elapsed_time_per_iteration": 6.57829428, "memory(GiB)": 21.51, "elapsed_time": "10h 44m 19s", "remaining_time": "52m 27s", "loss_scale": 1.0, "consumed_samples": 1506048, "global_step/max_steps": "5883/6362"} +{"lm loss": 4.8743968, "grad_norm": 0.27288935, "learning_rate": 4.49e-06, "elapsed_time_per_iteration": 6.49931526, "memory(GiB)": 21.51, "elapsed_time": "10h 44m 25s", "remaining_time": "52m 21s", "loss_scale": 1.0, "consumed_samples": 1506304, "global_step/max_steps": "5884/6362"} +{"lm loss": 4.85964155, "grad_norm": 0.26982784, "learning_rate": 4.48e-06, "elapsed_time_per_iteration": 6.52882528, "memory(GiB)": 21.51, "elapsed_time": "10h 44m 32s", "remaining_time": "52m 14s", "loss_scale": 1.0, "consumed_samples": 1506560, "global_step/max_steps": "5885/6362"} +{"lm loss": 4.87449932, "grad_norm": 0.27262381, "learning_rate": 4.48e-06, "elapsed_time_per_iteration": 6.37656355, "memory(GiB)": 21.51, "elapsed_time": "10h 44m 38s", "remaining_time": "52m 7s", "loss_scale": 1.0, "consumed_samples": 1506816, "global_step/max_steps": "5886/6362"} +{"lm loss": 4.86578417, "grad_norm": 0.27696803, "learning_rate": 4.47e-06, "elapsed_time_per_iteration": 6.55948448, "memory(GiB)": 21.51, "elapsed_time": "10h 44m 44s", "remaining_time": "52m 1s", "loss_scale": 1.0, "consumed_samples": 1507072, "global_step/max_steps": "5887/6362"} +{"lm loss": 4.85542297, "grad_norm": 0.26569483, "learning_rate": 4.46e-06, "elapsed_time_per_iteration": 6.50292945, "memory(GiB)": 21.51, "elapsed_time": "10h 44m 51s", "remaining_time": "51m 54s", "loss_scale": 1.0, "consumed_samples": 1507328, "global_step/max_steps": "5888/6362"} +{"lm loss": 4.88192701, "grad_norm": 0.28161633, "learning_rate": 4.46e-06, "elapsed_time_per_iteration": 6.61776567, "memory(GiB)": 21.51, "elapsed_time": "10h 44m 58s", "remaining_time": "51m 48s", "loss_scale": 1.0, "consumed_samples": 1507584, "global_step/max_steps": "5889/6362"} +{"lm loss": 4.84195662, "grad_norm": 0.26961178, "learning_rate": 4.45e-06, "elapsed_time_per_iteration": 6.43125463, "memory(GiB)": 21.51, "elapsed_time": "10h 45m 4s", "remaining_time": "51m 41s", "loss_scale": 1.0, "consumed_samples": 1507840, "global_step/max_steps": "5890/6362"} +{"lm loss": 4.86599588, "grad_norm": 0.28155336, "learning_rate": 4.45e-06, "elapsed_time_per_iteration": 6.41771817, "memory(GiB)": 21.51, "elapsed_time": "10h 45m 10s", "remaining_time": "51m 35s", "loss_scale": 1.0, "consumed_samples": 1508096, "global_step/max_steps": "5891/6362"} +{"lm loss": 4.87953281, "grad_norm": 0.28151685, "learning_rate": 4.44e-06, "elapsed_time_per_iteration": 6.31271505, "memory(GiB)": 21.51, "elapsed_time": "10h 45m 17s", "remaining_time": "51m 28s", "loss_scale": 1.0, "consumed_samples": 1508352, "global_step/max_steps": "5892/6362"} +{"lm loss": 4.86329794, "grad_norm": 0.26805854, "learning_rate": 4.43e-06, "elapsed_time_per_iteration": 6.87229729, "memory(GiB)": 21.51, "elapsed_time": "10h 45m 24s", "remaining_time": "51m 21s", "loss_scale": 1.0, "consumed_samples": 1508608, "global_step/max_steps": "5893/6362"} +{"lm loss": 4.84564352, "grad_norm": 0.28198713, "learning_rate": 4.43e-06, "elapsed_time_per_iteration": 6.62509513, "memory(GiB)": 21.51, "elapsed_time": "10h 45m 30s", "remaining_time": "51m 15s", "loss_scale": 1.0, "consumed_samples": 1508864, "global_step/max_steps": "5894/6362"} +{"lm loss": 4.87460852, "grad_norm": 0.28964621, "learning_rate": 4.42e-06, "elapsed_time_per_iteration": 6.4625752, "memory(GiB)": 21.51, "elapsed_time": "10h 45m 37s", "remaining_time": "51m 8s", "loss_scale": 1.0, "consumed_samples": 1509120, "global_step/max_steps": "5895/6362"} +{"lm loss": 4.88534117, "grad_norm": 0.2653791, "learning_rate": 4.42e-06, "elapsed_time_per_iteration": 6.58328533, "memory(GiB)": 21.51, "elapsed_time": "10h 45m 43s", "remaining_time": "51m 2s", "loss_scale": 1.0, "consumed_samples": 1509376, "global_step/max_steps": "5896/6362"} +{"lm loss": 4.87313747, "grad_norm": 0.27981424, "learning_rate": 4.41e-06, "elapsed_time_per_iteration": 6.65474057, "memory(GiB)": 21.51, "elapsed_time": "10h 45m 50s", "remaining_time": "50m 55s", "loss_scale": 1.0, "consumed_samples": 1509632, "global_step/max_steps": "5897/6362"} +{"lm loss": 4.8494091, "grad_norm": 0.26054931, "learning_rate": 4.4e-06, "elapsed_time_per_iteration": 6.68041778, "memory(GiB)": 21.51, "elapsed_time": "10h 45m 57s", "remaining_time": "50m 49s", "loss_scale": 1.0, "consumed_samples": 1509888, "global_step/max_steps": "5898/6362"} +{"lm loss": 4.85898256, "grad_norm": 0.27730489, "learning_rate": 4.4e-06, "elapsed_time_per_iteration": 6.60932922, "memory(GiB)": 21.51, "elapsed_time": "10h 46m 3s", "remaining_time": "50m 42s", "loss_scale": 1.0, "consumed_samples": 1510144, "global_step/max_steps": "5899/6362"} +{"lm loss": 4.8666091, "grad_norm": 0.26750517, "learning_rate": 4.39e-06, "elapsed_time_per_iteration": 6.67356491, "memory(GiB)": 21.51, "elapsed_time": "10h 46m 10s", "remaining_time": "50m 35s", "loss_scale": 1.0, "consumed_samples": 1510400, "global_step/max_steps": "5900/6362"} +{"lm loss": 4.85813618, "grad_norm": 0.26767555, "learning_rate": 4.39e-06, "elapsed_time_per_iteration": 6.56467772, "memory(GiB)": 21.51, "elapsed_time": "10h 46m 16s", "remaining_time": "50m 29s", "loss_scale": 1.0, "consumed_samples": 1510656, "global_step/max_steps": "5901/6362"} +{"lm loss": 4.85295916, "grad_norm": 0.27994186, "learning_rate": 4.38e-06, "elapsed_time_per_iteration": 6.6295774, "memory(GiB)": 21.51, "elapsed_time": "10h 46m 23s", "remaining_time": "50m 22s", "loss_scale": 1.0, "consumed_samples": 1510912, "global_step/max_steps": "5902/6362"} +{"lm loss": 4.88582277, "grad_norm": 0.27151024, "learning_rate": 4.37e-06, "elapsed_time_per_iteration": 6.49012113, "memory(GiB)": 21.51, "elapsed_time": "10h 46m 30s", "remaining_time": "50m 16s", "loss_scale": 1.0, "consumed_samples": 1511168, "global_step/max_steps": "5903/6362"} +{"lm loss": 4.83787584, "grad_norm": 0.27015534, "learning_rate": 4.37e-06, "elapsed_time_per_iteration": 6.8658576, "memory(GiB)": 21.51, "elapsed_time": "10h 46m 36s", "remaining_time": "50m 9s", "loss_scale": 1.0, "consumed_samples": 1511424, "global_step/max_steps": "5904/6362"} +{"lm loss": 4.85108519, "grad_norm": 0.26972041, "learning_rate": 4.36e-06, "elapsed_time_per_iteration": 6.67684531, "memory(GiB)": 21.51, "elapsed_time": "10h 46m 43s", "remaining_time": "50m 3s", "loss_scale": 1.0, "consumed_samples": 1511680, "global_step/max_steps": "5905/6362"} +{"lm loss": 4.83961391, "grad_norm": 0.25719467, "learning_rate": 4.36e-06, "elapsed_time_per_iteration": 6.4676218, "memory(GiB)": 21.51, "elapsed_time": "10h 46m 50s", "remaining_time": "49m 56s", "loss_scale": 1.0, "consumed_samples": 1511936, "global_step/max_steps": "5906/6362"} +{"lm loss": 4.87951851, "grad_norm": 0.25771654, "learning_rate": 4.35e-06, "elapsed_time_per_iteration": 6.61342335, "memory(GiB)": 21.51, "elapsed_time": "10h 46m 56s", "remaining_time": "49m 49s", "loss_scale": 1.0, "consumed_samples": 1512192, "global_step/max_steps": "5907/6362"} +{"lm loss": 4.88895512, "grad_norm": 0.2766861, "learning_rate": 4.34e-06, "elapsed_time_per_iteration": 6.44925714, "memory(GiB)": 21.51, "elapsed_time": "10h 47m 3s", "remaining_time": "49m 43s", "loss_scale": 1.0, "consumed_samples": 1512448, "global_step/max_steps": "5908/6362"} +{"lm loss": 4.87323618, "grad_norm": 0.26961121, "learning_rate": 4.34e-06, "elapsed_time_per_iteration": 6.5790596, "memory(GiB)": 21.51, "elapsed_time": "10h 47m 9s", "remaining_time": "49m 36s", "loss_scale": 1.0, "consumed_samples": 1512704, "global_step/max_steps": "5909/6362"} +{"lm loss": 4.88939095, "grad_norm": 0.25443313, "learning_rate": 4.33e-06, "elapsed_time_per_iteration": 6.42343831, "memory(GiB)": 21.51, "elapsed_time": "10h 47m 16s", "remaining_time": "49m 30s", "loss_scale": 1.0, "consumed_samples": 1512960, "global_step/max_steps": "5910/6362"} +{"lm loss": 4.84290695, "grad_norm": 0.25654611, "learning_rate": 4.33e-06, "elapsed_time_per_iteration": 6.51878262, "memory(GiB)": 21.51, "elapsed_time": "10h 47m 22s", "remaining_time": "49m 23s", "loss_scale": 1.0, "consumed_samples": 1513216, "global_step/max_steps": "5911/6362"} +{"lm loss": 4.87636518, "grad_norm": 0.27605614, "learning_rate": 4.32e-06, "elapsed_time_per_iteration": 6.64689565, "memory(GiB)": 21.51, "elapsed_time": "10h 47m 29s", "remaining_time": "49m 17s", "loss_scale": 1.0, "consumed_samples": 1513472, "global_step/max_steps": "5912/6362"} +{"lm loss": 4.85886955, "grad_norm": 0.26522228, "learning_rate": 4.31e-06, "elapsed_time_per_iteration": 6.60087323, "memory(GiB)": 21.51, "elapsed_time": "10h 47m 35s", "remaining_time": "49m 10s", "loss_scale": 1.0, "consumed_samples": 1513728, "global_step/max_steps": "5913/6362"} +{"lm loss": 4.87701607, "grad_norm": 0.25863969, "learning_rate": 4.31e-06, "elapsed_time_per_iteration": 6.666574, "memory(GiB)": 21.51, "elapsed_time": "10h 47m 42s", "remaining_time": "49m 3s", "loss_scale": 1.0, "consumed_samples": 1513984, "global_step/max_steps": "5914/6362"} +{"lm loss": 4.85440063, "grad_norm": 0.26517934, "learning_rate": 4.3e-06, "elapsed_time_per_iteration": 6.53238726, "memory(GiB)": 21.51, "elapsed_time": "10h 47m 49s", "remaining_time": "48m 57s", "loss_scale": 1.0, "consumed_samples": 1514240, "global_step/max_steps": "5915/6362"} +{"lm loss": 4.87598801, "grad_norm": 0.27979445, "learning_rate": 4.3e-06, "elapsed_time_per_iteration": 6.6064322, "memory(GiB)": 21.51, "elapsed_time": "10h 47m 55s", "remaining_time": "48m 50s", "loss_scale": 1.0, "consumed_samples": 1514496, "global_step/max_steps": "5916/6362"} +{"lm loss": 4.87933397, "grad_norm": 0.27422184, "learning_rate": 4.29e-06, "elapsed_time_per_iteration": 6.5038681, "memory(GiB)": 21.51, "elapsed_time": "10h 48m 2s", "remaining_time": "48m 44s", "loss_scale": 1.0, "consumed_samples": 1514752, "global_step/max_steps": "5917/6362"} +{"lm loss": 4.85709333, "grad_norm": 0.25139442, "learning_rate": 4.29e-06, "elapsed_time_per_iteration": 6.55415058, "memory(GiB)": 21.51, "elapsed_time": "10h 48m 8s", "remaining_time": "48m 37s", "loss_scale": 1.0, "consumed_samples": 1515008, "global_step/max_steps": "5918/6362"} +{"lm loss": 4.87626314, "grad_norm": 0.26931912, "learning_rate": 4.28e-06, "elapsed_time_per_iteration": 6.69566822, "memory(GiB)": 21.51, "elapsed_time": "10h 48m 15s", "remaining_time": "48m 31s", "loss_scale": 1.0, "consumed_samples": 1515264, "global_step/max_steps": "5919/6362"} +{"lm loss": 4.87200356, "grad_norm": 0.27723715, "learning_rate": 4.27e-06, "elapsed_time_per_iteration": 6.5682385, "memory(GiB)": 21.51, "elapsed_time": "10h 48m 22s", "remaining_time": "48m 24s", "loss_scale": 1.0, "consumed_samples": 1515520, "global_step/max_steps": "5920/6362"} +{"lm loss": 4.87708998, "grad_norm": 0.28429753, "learning_rate": 4.27e-06, "elapsed_time_per_iteration": 6.75774336, "memory(GiB)": 21.51, "elapsed_time": "10h 48m 28s", "remaining_time": "48m 17s", "loss_scale": 1.0, "consumed_samples": 1515776, "global_step/max_steps": "5921/6362"} +{"lm loss": 4.87933731, "grad_norm": 0.27344, "learning_rate": 4.26e-06, "elapsed_time_per_iteration": 6.67950773, "memory(GiB)": 21.51, "elapsed_time": "10h 48m 35s", "remaining_time": "48m 11s", "loss_scale": 1.0, "consumed_samples": 1516032, "global_step/max_steps": "5922/6362"} +{"lm loss": 4.86861801, "grad_norm": 0.27547857, "learning_rate": 4.26e-06, "elapsed_time_per_iteration": 6.74447227, "memory(GiB)": 21.51, "elapsed_time": "10h 48m 42s", "remaining_time": "48m 4s", "loss_scale": 1.0, "consumed_samples": 1516288, "global_step/max_steps": "5923/6362"} +{"lm loss": 4.86806154, "grad_norm": 0.27337116, "learning_rate": 4.25e-06, "elapsed_time_per_iteration": 6.60053134, "memory(GiB)": 21.51, "elapsed_time": "10h 48m 48s", "remaining_time": "47m 58s", "loss_scale": 1.0, "consumed_samples": 1516544, "global_step/max_steps": "5924/6362"} +{"lm loss": 4.87409687, "grad_norm": 0.26877573, "learning_rate": 4.25e-06, "elapsed_time_per_iteration": 6.80369925, "memory(GiB)": 21.51, "elapsed_time": "10h 48m 55s", "remaining_time": "47m 51s", "loss_scale": 1.0, "consumed_samples": 1516800, "global_step/max_steps": "5925/6362"} +{"lm loss": 4.86154652, "grad_norm": 0.28270859, "learning_rate": 4.24e-06, "elapsed_time_per_iteration": 6.62880015, "memory(GiB)": 21.51, "elapsed_time": "10h 49m 2s", "remaining_time": "47m 45s", "loss_scale": 1.0, "consumed_samples": 1517056, "global_step/max_steps": "5926/6362"} +{"lm loss": 4.87253189, "grad_norm": 0.26201975, "learning_rate": 4.23e-06, "elapsed_time_per_iteration": 6.58714819, "memory(GiB)": 21.51, "elapsed_time": "10h 49m 8s", "remaining_time": "47m 38s", "loss_scale": 1.0, "consumed_samples": 1517312, "global_step/max_steps": "5927/6362"} +{"lm loss": 4.87813473, "grad_norm": 0.25626445, "learning_rate": 4.23e-06, "elapsed_time_per_iteration": 6.47946954, "memory(GiB)": 21.51, "elapsed_time": "10h 49m 15s", "remaining_time": "47m 31s", "loss_scale": 1.0, "consumed_samples": 1517568, "global_step/max_steps": "5928/6362"} +{"lm loss": 4.86043882, "grad_norm": 0.28108364, "learning_rate": 4.22e-06, "elapsed_time_per_iteration": 6.56075573, "memory(GiB)": 21.51, "elapsed_time": "10h 49m 21s", "remaining_time": "47m 25s", "loss_scale": 1.0, "consumed_samples": 1517824, "global_step/max_steps": "5929/6362"} +{"lm loss": 4.87993574, "grad_norm": 0.27104104, "learning_rate": 4.22e-06, "elapsed_time_per_iteration": 6.60414958, "memory(GiB)": 21.51, "elapsed_time": "10h 49m 28s", "remaining_time": "47m 18s", "loss_scale": 1.0, "consumed_samples": 1518080, "global_step/max_steps": "5930/6362"} +{"lm loss": 4.87854052, "grad_norm": 0.25882179, "learning_rate": 4.21e-06, "elapsed_time_per_iteration": 6.51336169, "memory(GiB)": 21.51, "elapsed_time": "10h 49m 35s", "remaining_time": "47m 12s", "loss_scale": 1.0, "consumed_samples": 1518336, "global_step/max_steps": "5931/6362"} +{"lm loss": 4.88299751, "grad_norm": 0.27184764, "learning_rate": 4.21e-06, "elapsed_time_per_iteration": 6.58267498, "memory(GiB)": 21.51, "elapsed_time": "10h 49m 41s", "remaining_time": "47m 5s", "loss_scale": 1.0, "consumed_samples": 1518592, "global_step/max_steps": "5932/6362"} +{"lm loss": 4.85989666, "grad_norm": 0.28332999, "learning_rate": 4.2e-06, "elapsed_time_per_iteration": 6.69893384, "memory(GiB)": 21.51, "elapsed_time": "10h 49m 48s", "remaining_time": "46m 59s", "loss_scale": 1.0, "consumed_samples": 1518848, "global_step/max_steps": "5933/6362"} +{"lm loss": 4.86898232, "grad_norm": 0.27314803, "learning_rate": 4.2e-06, "elapsed_time_per_iteration": 6.47531509, "memory(GiB)": 21.51, "elapsed_time": "10h 49m 54s", "remaining_time": "46m 52s", "loss_scale": 1.0, "consumed_samples": 1519104, "global_step/max_steps": "5934/6362"} +{"lm loss": 4.84760523, "grad_norm": 0.27702558, "learning_rate": 4.19e-06, "elapsed_time_per_iteration": 6.73665881, "memory(GiB)": 21.51, "elapsed_time": "10h 50m 1s", "remaining_time": "46m 46s", "loss_scale": 1.0, "consumed_samples": 1519360, "global_step/max_steps": "5935/6362"} +{"lm loss": 4.84561825, "grad_norm": 0.26272881, "learning_rate": 4.18e-06, "elapsed_time_per_iteration": 6.41131616, "memory(GiB)": 21.51, "elapsed_time": "10h 50m 7s", "remaining_time": "46m 39s", "loss_scale": 1.0, "consumed_samples": 1519616, "global_step/max_steps": "5936/6362"} +{"lm loss": 4.8636322, "grad_norm": 0.273148, "learning_rate": 4.18e-06, "elapsed_time_per_iteration": 6.53325129, "memory(GiB)": 21.51, "elapsed_time": "10h 50m 14s", "remaining_time": "46m 32s", "loss_scale": 1.0, "consumed_samples": 1519872, "global_step/max_steps": "5937/6362"} +{"lm loss": 4.85956812, "grad_norm": 0.26414567, "learning_rate": 4.17e-06, "elapsed_time_per_iteration": 6.49916339, "memory(GiB)": 21.51, "elapsed_time": "10h 50m 20s", "remaining_time": "46m 26s", "loss_scale": 1.0, "consumed_samples": 1520128, "global_step/max_steps": "5938/6362"} +{"lm loss": 4.84697151, "grad_norm": 0.27485949, "learning_rate": 4.17e-06, "elapsed_time_per_iteration": 6.72651982, "memory(GiB)": 21.51, "elapsed_time": "10h 50m 27s", "remaining_time": "46m 19s", "loss_scale": 1.0, "consumed_samples": 1520384, "global_step/max_steps": "5939/6362"} +{"lm loss": 4.87705374, "grad_norm": 0.26871562, "learning_rate": 4.16e-06, "elapsed_time_per_iteration": 6.55040646, "memory(GiB)": 21.51, "elapsed_time": "10h 50m 34s", "remaining_time": "46m 13s", "loss_scale": 1.0, "consumed_samples": 1520640, "global_step/max_steps": "5940/6362"} +{"lm loss": 4.85954189, "grad_norm": 0.25576121, "learning_rate": 4.16e-06, "elapsed_time_per_iteration": 6.41897202, "memory(GiB)": 21.51, "elapsed_time": "10h 50m 40s", "remaining_time": "46m 6s", "loss_scale": 1.0, "consumed_samples": 1520896, "global_step/max_steps": "5941/6362"} +{"lm loss": 4.85070848, "grad_norm": 0.2749401, "learning_rate": 4.15e-06, "elapsed_time_per_iteration": 6.56914759, "memory(GiB)": 21.51, "elapsed_time": "10h 50m 47s", "remaining_time": "45m 59s", "loss_scale": 1.0, "consumed_samples": 1521152, "global_step/max_steps": "5942/6362"} +{"lm loss": 4.86638832, "grad_norm": 0.25822368, "learning_rate": 4.15e-06, "elapsed_time_per_iteration": 6.53852701, "memory(GiB)": 21.51, "elapsed_time": "10h 50m 53s", "remaining_time": "45m 53s", "loss_scale": 1.0, "consumed_samples": 1521408, "global_step/max_steps": "5943/6362"} +{"lm loss": 4.86054945, "grad_norm": 0.26092452, "learning_rate": 4.14e-06, "elapsed_time_per_iteration": 6.62086153, "memory(GiB)": 21.51, "elapsed_time": "10h 51m 0s", "remaining_time": "45m 46s", "loss_scale": 1.0, "consumed_samples": 1521664, "global_step/max_steps": "5944/6362"} +{"lm loss": 4.86377239, "grad_norm": 0.25750941, "learning_rate": 4.13e-06, "elapsed_time_per_iteration": 6.51866817, "memory(GiB)": 21.51, "elapsed_time": "10h 51m 6s", "remaining_time": "45m 40s", "loss_scale": 1.0, "consumed_samples": 1521920, "global_step/max_steps": "5945/6362"} +{"lm loss": 4.90297222, "grad_norm": 0.25550887, "learning_rate": 4.13e-06, "elapsed_time_per_iteration": 6.65543461, "memory(GiB)": 21.51, "elapsed_time": "10h 51m 13s", "remaining_time": "45m 33s", "loss_scale": 1.0, "consumed_samples": 1522176, "global_step/max_steps": "5946/6362"} +{"lm loss": 4.83745813, "grad_norm": 0.27553025, "learning_rate": 4.12e-06, "elapsed_time_per_iteration": 6.45450306, "memory(GiB)": 21.51, "elapsed_time": "10h 51m 20s", "remaining_time": "45m 27s", "loss_scale": 1.0, "consumed_samples": 1522432, "global_step/max_steps": "5947/6362"} +{"lm loss": 4.89233112, "grad_norm": 0.25457931, "learning_rate": 4.12e-06, "elapsed_time_per_iteration": 6.41414857, "memory(GiB)": 21.51, "elapsed_time": "10h 51m 26s", "remaining_time": "45m 20s", "loss_scale": 1.0, "consumed_samples": 1522688, "global_step/max_steps": "5948/6362"} +{"lm loss": 4.85765123, "grad_norm": 0.26660138, "learning_rate": 4.11e-06, "elapsed_time_per_iteration": 6.38755751, "memory(GiB)": 21.51, "elapsed_time": "10h 51m 32s", "remaining_time": "45m 13s", "loss_scale": 1.0, "consumed_samples": 1522944, "global_step/max_steps": "5949/6362"} +{"lm loss": 4.89174271, "grad_norm": 0.26525331, "learning_rate": 4.11e-06, "elapsed_time_per_iteration": 6.5082891, "memory(GiB)": 21.51, "elapsed_time": "10h 51m 39s", "remaining_time": "45m 7s", "loss_scale": 1.0, "consumed_samples": 1523200, "global_step/max_steps": "5950/6362"} +{"lm loss": 4.85541773, "grad_norm": 0.26187772, "learning_rate": 4.1e-06, "elapsed_time_per_iteration": 6.5071578, "memory(GiB)": 21.51, "elapsed_time": "10h 51m 45s", "remaining_time": "45m 0s", "loss_scale": 1.0, "consumed_samples": 1523456, "global_step/max_steps": "5951/6362"} +{"lm loss": 4.86950254, "grad_norm": 0.26223558, "learning_rate": 4.1e-06, "elapsed_time_per_iteration": 6.37444949, "memory(GiB)": 21.51, "elapsed_time": "10h 51m 52s", "remaining_time": "44m 54s", "loss_scale": 1.0, "consumed_samples": 1523712, "global_step/max_steps": "5952/6362"} +{"lm loss": 4.86680746, "grad_norm": 0.26365972, "learning_rate": 4.09e-06, "elapsed_time_per_iteration": 6.49717593, "memory(GiB)": 21.51, "elapsed_time": "10h 51m 58s", "remaining_time": "44m 47s", "loss_scale": 1.0, "consumed_samples": 1523968, "global_step/max_steps": "5953/6362"} +{"lm loss": 4.90567493, "grad_norm": 0.27730232, "learning_rate": 4.09e-06, "elapsed_time_per_iteration": 6.62492776, "memory(GiB)": 21.51, "elapsed_time": "10h 52m 5s", "remaining_time": "44m 41s", "loss_scale": 1.0, "consumed_samples": 1524224, "global_step/max_steps": "5954/6362"} +{"lm loss": 4.88028908, "grad_norm": 0.26098484, "learning_rate": 4.08e-06, "elapsed_time_per_iteration": 6.52834749, "memory(GiB)": 21.51, "elapsed_time": "10h 52m 11s", "remaining_time": "44m 34s", "loss_scale": 1.0, "consumed_samples": 1524480, "global_step/max_steps": "5955/6362"} +{"lm loss": 4.86929655, "grad_norm": 0.26989254, "learning_rate": 4.08e-06, "elapsed_time_per_iteration": 6.52447987, "memory(GiB)": 21.51, "elapsed_time": "10h 52m 18s", "remaining_time": "44m 27s", "loss_scale": 1.0, "consumed_samples": 1524736, "global_step/max_steps": "5956/6362"} +{"lm loss": 4.84365654, "grad_norm": 0.26486427, "learning_rate": 4.07e-06, "elapsed_time_per_iteration": 6.52446914, "memory(GiB)": 21.51, "elapsed_time": "10h 52m 24s", "remaining_time": "44m 21s", "loss_scale": 1.0, "consumed_samples": 1524992, "global_step/max_steps": "5957/6362"} +{"lm loss": 4.86150408, "grad_norm": 0.26946017, "learning_rate": 4.07e-06, "elapsed_time_per_iteration": 6.5951674, "memory(GiB)": 21.51, "elapsed_time": "10h 52m 31s", "remaining_time": "44m 14s", "loss_scale": 1.0, "consumed_samples": 1525248, "global_step/max_steps": "5958/6362"} +{"lm loss": 4.86412907, "grad_norm": 0.27760997, "learning_rate": 4.06e-06, "elapsed_time_per_iteration": 6.62714672, "memory(GiB)": 21.51, "elapsed_time": "10h 52m 38s", "remaining_time": "44m 8s", "loss_scale": 1.0, "consumed_samples": 1525504, "global_step/max_steps": "5959/6362"} +{"lm loss": 4.8767848, "grad_norm": 0.27801782, "learning_rate": 4.05e-06, "elapsed_time_per_iteration": 6.42500305, "memory(GiB)": 21.51, "elapsed_time": "10h 52m 44s", "remaining_time": "44m 1s", "loss_scale": 1.0, "consumed_samples": 1525760, "global_step/max_steps": "5960/6362"} +{"lm loss": 4.85369539, "grad_norm": 0.26257664, "learning_rate": 4.05e-06, "elapsed_time_per_iteration": 6.5888021, "memory(GiB)": 21.51, "elapsed_time": "10h 52m 51s", "remaining_time": "43m 55s", "loss_scale": 1.0, "consumed_samples": 1526016, "global_step/max_steps": "5961/6362"} +{"lm loss": 4.86738348, "grad_norm": 0.25351402, "learning_rate": 4.04e-06, "elapsed_time_per_iteration": 6.41749644, "memory(GiB)": 21.51, "elapsed_time": "10h 52m 57s", "remaining_time": "43m 48s", "loss_scale": 1.0, "consumed_samples": 1526272, "global_step/max_steps": "5962/6362"} +{"lm loss": 4.89742804, "grad_norm": 0.27040607, "learning_rate": 4.04e-06, "elapsed_time_per_iteration": 6.67616677, "memory(GiB)": 21.51, "elapsed_time": "10h 53m 4s", "remaining_time": "43m 41s", "loss_scale": 1.0, "consumed_samples": 1526528, "global_step/max_steps": "5963/6362"} +{"lm loss": 4.89879751, "grad_norm": 0.27640575, "learning_rate": 4.03e-06, "elapsed_time_per_iteration": 6.71735191, "memory(GiB)": 21.51, "elapsed_time": "10h 53m 10s", "remaining_time": "43m 35s", "loss_scale": 1.0, "consumed_samples": 1526784, "global_step/max_steps": "5964/6362"} +{"lm loss": 4.86647987, "grad_norm": 0.26073107, "learning_rate": 4.03e-06, "elapsed_time_per_iteration": 6.58108401, "memory(GiB)": 21.51, "elapsed_time": "10h 53m 17s", "remaining_time": "43m 28s", "loss_scale": 1.0, "consumed_samples": 1527040, "global_step/max_steps": "5965/6362"} +{"lm loss": 4.85764551, "grad_norm": 0.26327467, "learning_rate": 4.02e-06, "elapsed_time_per_iteration": 6.34051585, "memory(GiB)": 21.51, "elapsed_time": "10h 53m 23s", "remaining_time": "43m 22s", "loss_scale": 1.0, "consumed_samples": 1527296, "global_step/max_steps": "5966/6362"} +{"lm loss": 4.87298393, "grad_norm": 0.27101359, "learning_rate": 4.02e-06, "elapsed_time_per_iteration": 6.34267902, "memory(GiB)": 21.51, "elapsed_time": "10h 53m 30s", "remaining_time": "43m 15s", "loss_scale": 1.0, "consumed_samples": 1527552, "global_step/max_steps": "5967/6362"} +{"lm loss": 4.84187555, "grad_norm": 0.26711118, "learning_rate": 4.01e-06, "elapsed_time_per_iteration": 6.37074089, "memory(GiB)": 21.51, "elapsed_time": "10h 53m 36s", "remaining_time": "43m 9s", "loss_scale": 1.0, "consumed_samples": 1527808, "global_step/max_steps": "5968/6362"} +{"lm loss": 4.83932686, "grad_norm": 0.25443459, "learning_rate": 4.01e-06, "elapsed_time_per_iteration": 6.7044456, "memory(GiB)": 21.51, "elapsed_time": "10h 53m 43s", "remaining_time": "43m 2s", "loss_scale": 1.0, "consumed_samples": 1528064, "global_step/max_steps": "5969/6362"} +{"lm loss": 4.84607267, "grad_norm": 0.26840091, "learning_rate": 4e-06, "elapsed_time_per_iteration": 6.440557, "memory(GiB)": 21.51, "elapsed_time": "10h 53m 49s", "remaining_time": "42m 55s", "loss_scale": 1.0, "consumed_samples": 1528320, "global_step/max_steps": "5970/6362"} +{"lm loss": 4.86552048, "grad_norm": 0.26699564, "learning_rate": 4e-06, "elapsed_time_per_iteration": 6.35865569, "memory(GiB)": 21.51, "elapsed_time": "10h 53m 56s", "remaining_time": "42m 49s", "loss_scale": 1.0, "consumed_samples": 1528576, "global_step/max_steps": "5971/6362"} +{"lm loss": 4.83124495, "grad_norm": 0.25941288, "learning_rate": 3.99e-06, "elapsed_time_per_iteration": 6.5853045, "memory(GiB)": 21.51, "elapsed_time": "10h 54m 2s", "remaining_time": "42m 42s", "loss_scale": 1.0, "consumed_samples": 1528832, "global_step/max_steps": "5972/6362"} +{"lm loss": 4.86996126, "grad_norm": 0.25235119, "learning_rate": 3.99e-06, "elapsed_time_per_iteration": 6.68141198, "memory(GiB)": 21.51, "elapsed_time": "10h 54m 9s", "remaining_time": "42m 36s", "loss_scale": 1.0, "consumed_samples": 1529088, "global_step/max_steps": "5973/6362"} +{"lm loss": 4.84369564, "grad_norm": 0.26924574, "learning_rate": 3.98e-06, "elapsed_time_per_iteration": 6.49976254, "memory(GiB)": 21.51, "elapsed_time": "10h 54m 15s", "remaining_time": "42m 29s", "loss_scale": 1.0, "consumed_samples": 1529344, "global_step/max_steps": "5974/6362"} +{"lm loss": 4.85873556, "grad_norm": 0.25558266, "learning_rate": 3.98e-06, "elapsed_time_per_iteration": 6.49881959, "memory(GiB)": 21.51, "elapsed_time": "10h 54m 22s", "remaining_time": "42m 23s", "loss_scale": 1.0, "consumed_samples": 1529600, "global_step/max_steps": "5975/6362"} +{"lm loss": 4.8402257, "grad_norm": 0.25097749, "learning_rate": 3.97e-06, "elapsed_time_per_iteration": 6.54590082, "memory(GiB)": 21.51, "elapsed_time": "10h 54m 28s", "remaining_time": "42m 16s", "loss_scale": 1.0, "consumed_samples": 1529856, "global_step/max_steps": "5976/6362"} +{"lm loss": 4.86915827, "grad_norm": 0.26180327, "learning_rate": 3.97e-06, "elapsed_time_per_iteration": 6.58586597, "memory(GiB)": 21.51, "elapsed_time": "10h 54m 35s", "remaining_time": "42m 9s", "loss_scale": 1.0, "consumed_samples": 1530112, "global_step/max_steps": "5977/6362"} +{"lm loss": 4.86672258, "grad_norm": 0.26329386, "learning_rate": 3.96e-06, "elapsed_time_per_iteration": 6.35811591, "memory(GiB)": 21.51, "elapsed_time": "10h 54m 41s", "remaining_time": "42m 3s", "loss_scale": 1.0, "consumed_samples": 1530368, "global_step/max_steps": "5978/6362"} +{"lm loss": 4.86147928, "grad_norm": 0.25736892, "learning_rate": 3.96e-06, "elapsed_time_per_iteration": 6.55701876, "memory(GiB)": 21.51, "elapsed_time": "10h 54m 48s", "remaining_time": "41m 56s", "loss_scale": 1.0, "consumed_samples": 1530624, "global_step/max_steps": "5979/6362"} +{"lm loss": 4.85574675, "grad_norm": 0.25198844, "learning_rate": 3.95e-06, "elapsed_time_per_iteration": 6.40404534, "memory(GiB)": 21.51, "elapsed_time": "10h 54m 54s", "remaining_time": "41m 50s", "loss_scale": 1.0, "consumed_samples": 1530880, "global_step/max_steps": "5980/6362"} +{"lm loss": 4.86988354, "grad_norm": 0.25772566, "learning_rate": 3.95e-06, "elapsed_time_per_iteration": 6.6173892, "memory(GiB)": 21.51, "elapsed_time": "10h 55m 1s", "remaining_time": "41m 43s", "loss_scale": 1.0, "consumed_samples": 1531136, "global_step/max_steps": "5981/6362"} +{"lm loss": 4.85344076, "grad_norm": 0.25739014, "learning_rate": 3.94e-06, "elapsed_time_per_iteration": 6.58870029, "memory(GiB)": 21.51, "elapsed_time": "10h 55m 8s", "remaining_time": "41m 36s", "loss_scale": 1.0, "consumed_samples": 1531392, "global_step/max_steps": "5982/6362"} +{"lm loss": 4.87445641, "grad_norm": 0.26557478, "learning_rate": 3.94e-06, "elapsed_time_per_iteration": 6.62889385, "memory(GiB)": 21.51, "elapsed_time": "10h 55m 14s", "remaining_time": "41m 30s", "loss_scale": 1.0, "consumed_samples": 1531648, "global_step/max_steps": "5983/6362"} +{"lm loss": 4.86189795, "grad_norm": 0.25882941, "learning_rate": 3.93e-06, "elapsed_time_per_iteration": 6.65616632, "memory(GiB)": 21.51, "elapsed_time": "10h 55m 21s", "remaining_time": "41m 23s", "loss_scale": 1.0, "consumed_samples": 1531904, "global_step/max_steps": "5984/6362"} +{"lm loss": 4.85970974, "grad_norm": 0.25075203, "learning_rate": 3.93e-06, "elapsed_time_per_iteration": 6.65817285, "memory(GiB)": 21.51, "elapsed_time": "10h 55m 27s", "remaining_time": "41m 17s", "loss_scale": 1.0, "consumed_samples": 1532160, "global_step/max_steps": "5985/6362"} +{"lm loss": 4.87046671, "grad_norm": 0.28284821, "learning_rate": 3.92e-06, "elapsed_time_per_iteration": 6.63750768, "memory(GiB)": 21.51, "elapsed_time": "10h 55m 34s", "remaining_time": "41m 10s", "loss_scale": 1.0, "consumed_samples": 1532416, "global_step/max_steps": "5986/6362"} +{"lm loss": 4.86603785, "grad_norm": 0.25978211, "learning_rate": 3.92e-06, "elapsed_time_per_iteration": 6.59731102, "memory(GiB)": 21.51, "elapsed_time": "10h 55m 41s", "remaining_time": "41m 4s", "loss_scale": 1.0, "consumed_samples": 1532672, "global_step/max_steps": "5987/6362"} +{"lm loss": 4.84837246, "grad_norm": 0.25004444, "learning_rate": 3.91e-06, "elapsed_time_per_iteration": 6.57207775, "memory(GiB)": 21.51, "elapsed_time": "10h 55m 47s", "remaining_time": "40m 57s", "loss_scale": 1.0, "consumed_samples": 1532928, "global_step/max_steps": "5988/6362"} +{"lm loss": 4.86449337, "grad_norm": 0.26209822, "learning_rate": 3.91e-06, "elapsed_time_per_iteration": 6.48218942, "memory(GiB)": 21.51, "elapsed_time": "10h 55m 54s", "remaining_time": "40m 51s", "loss_scale": 1.0, "consumed_samples": 1533184, "global_step/max_steps": "5989/6362"} +{"lm loss": 4.85185242, "grad_norm": 0.25983745, "learning_rate": 3.9e-06, "elapsed_time_per_iteration": 6.42759395, "memory(GiB)": 21.51, "elapsed_time": "10h 56m 0s", "remaining_time": "40m 44s", "loss_scale": 1.0, "consumed_samples": 1533440, "global_step/max_steps": "5990/6362"} +{"lm loss": 4.86444473, "grad_norm": 0.25842208, "learning_rate": 3.9e-06, "elapsed_time_per_iteration": 6.56490898, "memory(GiB)": 21.51, "elapsed_time": "10h 56m 7s", "remaining_time": "40m 37s", "loss_scale": 1.0, "consumed_samples": 1533696, "global_step/max_steps": "5991/6362"} +{"lm loss": 4.83986139, "grad_norm": 0.25343317, "learning_rate": 3.89e-06, "elapsed_time_per_iteration": 6.52056766, "memory(GiB)": 21.51, "elapsed_time": "10h 56m 13s", "remaining_time": "40m 31s", "loss_scale": 1.0, "consumed_samples": 1533952, "global_step/max_steps": "5992/6362"} +{"lm loss": 4.87436438, "grad_norm": 0.25235152, "learning_rate": 3.89e-06, "elapsed_time_per_iteration": 6.5363338, "memory(GiB)": 21.51, "elapsed_time": "10h 56m 20s", "remaining_time": "40m 24s", "loss_scale": 1.0, "consumed_samples": 1534208, "global_step/max_steps": "5993/6362"} +{"lm loss": 4.86762381, "grad_norm": 0.26683891, "learning_rate": 3.88e-06, "elapsed_time_per_iteration": 6.58743715, "memory(GiB)": 21.51, "elapsed_time": "10h 56m 26s", "remaining_time": "40m 18s", "loss_scale": 1.0, "consumed_samples": 1534464, "global_step/max_steps": "5994/6362"} +{"lm loss": 4.88980865, "grad_norm": 0.25106478, "learning_rate": 3.88e-06, "elapsed_time_per_iteration": 6.54871702, "memory(GiB)": 21.51, "elapsed_time": "10h 56m 33s", "remaining_time": "40m 11s", "loss_scale": 1.0, "consumed_samples": 1534720, "global_step/max_steps": "5995/6362"} +{"lm loss": 4.86151886, "grad_norm": 0.25495082, "learning_rate": 3.88e-06, "elapsed_time_per_iteration": 6.52120972, "memory(GiB)": 21.51, "elapsed_time": "10h 56m 39s", "remaining_time": "40m 5s", "loss_scale": 1.0, "consumed_samples": 1534976, "global_step/max_steps": "5996/6362"} +{"lm loss": 4.88580179, "grad_norm": 0.25849706, "learning_rate": 3.87e-06, "elapsed_time_per_iteration": 6.60550785, "memory(GiB)": 21.51, "elapsed_time": "10h 56m 46s", "remaining_time": "39m 58s", "loss_scale": 1.0, "consumed_samples": 1535232, "global_step/max_steps": "5997/6362"} +{"lm loss": 4.83643866, "grad_norm": 0.25327486, "learning_rate": 3.87e-06, "elapsed_time_per_iteration": 6.69985056, "memory(GiB)": 21.51, "elapsed_time": "10h 56m 53s", "remaining_time": "39m 51s", "loss_scale": 1.0, "consumed_samples": 1535488, "global_step/max_steps": "5998/6362"} +{"lm loss": 4.87689018, "grad_norm": 0.26303154, "learning_rate": 3.86e-06, "elapsed_time_per_iteration": 6.66959214, "memory(GiB)": 21.51, "elapsed_time": "10h 56m 59s", "remaining_time": "39m 45s", "loss_scale": 1.0, "consumed_samples": 1535744, "global_step/max_steps": "5999/6362"} +{"lm loss": 4.90839863, "grad_norm": 0.25999603, "learning_rate": 3.86e-06, "elapsed_time_per_iteration": 6.56248093, "memory(GiB)": 21.51, "elapsed_time": "10h 57m 6s", "remaining_time": "39m 38s", "loss_scale": 1.0, "consumed_samples": 1536000, "global_step/max_steps": "6000/6362"} +{"lm loss": 4.85100412, "grad_norm": 0.25847018, "learning_rate": 3.85e-06, "elapsed_time_per_iteration": 6.85321021, "memory(GiB)": 21.51, "elapsed_time": "10h 57m 13s", "remaining_time": "39m 32s", "loss_scale": 1.0, "consumed_samples": 1536256, "global_step/max_steps": "6001/6362"} +{"lm loss": 4.8618412, "grad_norm": 0.26391077, "learning_rate": 3.85e-06, "elapsed_time_per_iteration": 6.82913756, "memory(GiB)": 21.51, "elapsed_time": "10h 57m 20s", "remaining_time": "39m 25s", "loss_scale": 1.0, "consumed_samples": 1536512, "global_step/max_steps": "6002/6362"} +{"lm loss": 4.84449387, "grad_norm": 0.26096854, "learning_rate": 3.84e-06, "elapsed_time_per_iteration": 6.72072172, "memory(GiB)": 21.51, "elapsed_time": "10h 57m 26s", "remaining_time": "39m 19s", "loss_scale": 1.0, "consumed_samples": 1536768, "global_step/max_steps": "6003/6362"} +{"lm loss": 4.8514266, "grad_norm": 0.27143151, "learning_rate": 3.84e-06, "elapsed_time_per_iteration": 6.49686337, "memory(GiB)": 21.51, "elapsed_time": "10h 57m 33s", "remaining_time": "39m 12s", "loss_scale": 1.0, "consumed_samples": 1537024, "global_step/max_steps": "6004/6362"} +{"lm loss": 4.85354519, "grad_norm": 0.26760402, "learning_rate": 3.83e-06, "elapsed_time_per_iteration": 6.58620262, "memory(GiB)": 21.51, "elapsed_time": "10h 57m 39s", "remaining_time": "39m 5s", "loss_scale": 1.0, "consumed_samples": 1537280, "global_step/max_steps": "6005/6362"} +{"lm loss": 4.86928511, "grad_norm": 0.27436244, "learning_rate": 3.83e-06, "elapsed_time_per_iteration": 6.60393476, "memory(GiB)": 21.51, "elapsed_time": "10h 57m 46s", "remaining_time": "38m 59s", "loss_scale": 1.0, "consumed_samples": 1537536, "global_step/max_steps": "6006/6362"} +{"lm loss": 4.86842394, "grad_norm": 0.27582675, "learning_rate": 3.82e-06, "elapsed_time_per_iteration": 6.56670833, "memory(GiB)": 21.51, "elapsed_time": "10h 57m 53s", "remaining_time": "38m 52s", "loss_scale": 1.0, "consumed_samples": 1537792, "global_step/max_steps": "6007/6362"} +{"lm loss": 4.84885073, "grad_norm": 0.26298401, "learning_rate": 3.82e-06, "elapsed_time_per_iteration": 6.6802516, "memory(GiB)": 21.51, "elapsed_time": "10h 57m 59s", "remaining_time": "38m 46s", "loss_scale": 1.0, "consumed_samples": 1538048, "global_step/max_steps": "6008/6362"} +{"lm loss": 4.86615276, "grad_norm": 0.26955557, "learning_rate": 3.81e-06, "elapsed_time_per_iteration": 6.73675895, "memory(GiB)": 21.51, "elapsed_time": "10h 58m 6s", "remaining_time": "38m 39s", "loss_scale": 1.0, "consumed_samples": 1538304, "global_step/max_steps": "6009/6362"} +{"lm loss": 4.88122272, "grad_norm": 0.28004292, "learning_rate": 3.81e-06, "elapsed_time_per_iteration": 6.70159078, "memory(GiB)": 21.51, "elapsed_time": "10h 58m 13s", "remaining_time": "38m 33s", "loss_scale": 1.0, "consumed_samples": 1538560, "global_step/max_steps": "6010/6362"} +{"lm loss": 4.87625837, "grad_norm": 0.26242477, "learning_rate": 3.8e-06, "elapsed_time_per_iteration": 6.65712309, "memory(GiB)": 21.51, "elapsed_time": "10h 58m 19s", "remaining_time": "38m 26s", "loss_scale": 1.0, "consumed_samples": 1538816, "global_step/max_steps": "6011/6362"} +{"lm loss": 4.85169411, "grad_norm": 0.26349729, "learning_rate": 3.8e-06, "elapsed_time_per_iteration": 6.69193625, "memory(GiB)": 21.51, "elapsed_time": "10h 58m 26s", "remaining_time": "38m 19s", "loss_scale": 1.0, "consumed_samples": 1539072, "global_step/max_steps": "6012/6362"} +{"lm loss": 4.86484385, "grad_norm": 0.26991302, "learning_rate": 3.8e-06, "elapsed_time_per_iteration": 6.72875047, "memory(GiB)": 21.51, "elapsed_time": "10h 58m 33s", "remaining_time": "38m 13s", "loss_scale": 1.0, "consumed_samples": 1539328, "global_step/max_steps": "6013/6362"} +{"lm loss": 4.87326145, "grad_norm": 0.26652682, "learning_rate": 3.79e-06, "elapsed_time_per_iteration": 6.77409577, "memory(GiB)": 21.51, "elapsed_time": "10h 58m 40s", "remaining_time": "38m 6s", "loss_scale": 1.0, "consumed_samples": 1539584, "global_step/max_steps": "6014/6362"} +{"lm loss": 4.8499651, "grad_norm": 0.26844218, "learning_rate": 3.79e-06, "elapsed_time_per_iteration": 6.47091055, "memory(GiB)": 21.51, "elapsed_time": "10h 58m 46s", "remaining_time": "38m 0s", "loss_scale": 1.0, "consumed_samples": 1539840, "global_step/max_steps": "6015/6362"} +{"lm loss": 4.87061644, "grad_norm": 0.25863564, "learning_rate": 3.78e-06, "elapsed_time_per_iteration": 6.47020435, "memory(GiB)": 21.51, "elapsed_time": "10h 58m 53s", "remaining_time": "37m 53s", "loss_scale": 1.0, "consumed_samples": 1540096, "global_step/max_steps": "6016/6362"} +{"lm loss": 4.89352274, "grad_norm": 0.25993371, "learning_rate": 3.78e-06, "elapsed_time_per_iteration": 8.15748405, "memory(GiB)": 21.51, "elapsed_time": "10h 59m 1s", "remaining_time": "37m 47s", "loss_scale": 1.0, "consumed_samples": 1540352, "global_step/max_steps": "6017/6362"} +{"lm loss": 4.88681793, "grad_norm": 0.27350593, "learning_rate": 3.77e-06, "elapsed_time_per_iteration": 6.74761677, "memory(GiB)": 21.51, "elapsed_time": "10h 59m 7s", "remaining_time": "37m 40s", "loss_scale": 1.0, "consumed_samples": 1540608, "global_step/max_steps": "6018/6362"} +{"lm loss": 4.86329651, "grad_norm": 0.24753171, "learning_rate": 3.77e-06, "elapsed_time_per_iteration": 6.58944154, "memory(GiB)": 21.51, "elapsed_time": "10h 59m 14s", "remaining_time": "37m 34s", "loss_scale": 1.0, "consumed_samples": 1540864, "global_step/max_steps": "6019/6362"} +{"lm loss": 4.88193655, "grad_norm": 0.26233679, "learning_rate": 3.76e-06, "elapsed_time_per_iteration": 6.54440737, "memory(GiB)": 21.51, "elapsed_time": "10h 59m 21s", "remaining_time": "37m 27s", "loss_scale": 1.0, "consumed_samples": 1541120, "global_step/max_steps": "6020/6362"} +{"lm loss": 4.88798237, "grad_norm": 0.26326948, "learning_rate": 3.76e-06, "elapsed_time_per_iteration": 6.53704524, "memory(GiB)": 21.51, "elapsed_time": "10h 59m 27s", "remaining_time": "37m 20s", "loss_scale": 1.0, "consumed_samples": 1541376, "global_step/max_steps": "6021/6362"} +{"lm loss": 4.88168764, "grad_norm": 0.25382781, "learning_rate": 3.76e-06, "elapsed_time_per_iteration": 6.69952464, "memory(GiB)": 21.51, "elapsed_time": "10h 59m 34s", "remaining_time": "37m 14s", "loss_scale": 1.0, "consumed_samples": 1541632, "global_step/max_steps": "6022/6362"} +{"lm loss": 4.87615299, "grad_norm": 0.25851414, "learning_rate": 3.75e-06, "elapsed_time_per_iteration": 6.74082661, "memory(GiB)": 21.51, "elapsed_time": "10h 59m 41s", "remaining_time": "37m 7s", "loss_scale": 1.0, "consumed_samples": 1541888, "global_step/max_steps": "6023/6362"} +{"lm loss": 4.87404966, "grad_norm": 0.25013176, "learning_rate": 3.75e-06, "elapsed_time_per_iteration": 6.71474457, "memory(GiB)": 21.51, "elapsed_time": "10h 59m 47s", "remaining_time": "37m 1s", "loss_scale": 1.0, "consumed_samples": 1542144, "global_step/max_steps": "6024/6362"} +{"lm loss": 4.87203312, "grad_norm": 0.26157007, "learning_rate": 3.74e-06, "elapsed_time_per_iteration": 6.54479074, "memory(GiB)": 21.51, "elapsed_time": "10h 59m 54s", "remaining_time": "36m 54s", "loss_scale": 1.0, "consumed_samples": 1542400, "global_step/max_steps": "6025/6362"} +{"lm loss": 4.87754726, "grad_norm": 0.25686926, "learning_rate": 3.74e-06, "elapsed_time_per_iteration": 6.6299789, "memory(GiB)": 21.51, "elapsed_time": "11h 0m 0s", "remaining_time": "36m 48s", "loss_scale": 1.0, "consumed_samples": 1542656, "global_step/max_steps": "6026/6362"} +{"lm loss": 4.87199974, "grad_norm": 0.25708085, "learning_rate": 3.73e-06, "elapsed_time_per_iteration": 6.56125903, "memory(GiB)": 21.51, "elapsed_time": "11h 0m 7s", "remaining_time": "36m 41s", "loss_scale": 1.0, "consumed_samples": 1542912, "global_step/max_steps": "6027/6362"} +{"lm loss": 4.86732435, "grad_norm": 0.27132845, "learning_rate": 3.73e-06, "elapsed_time_per_iteration": 6.6072526, "memory(GiB)": 21.51, "elapsed_time": "11h 0m 14s", "remaining_time": "36m 34s", "loss_scale": 1.0, "consumed_samples": 1543168, "global_step/max_steps": "6028/6362"} +{"lm loss": 4.8791461, "grad_norm": 0.25924096, "learning_rate": 3.72e-06, "elapsed_time_per_iteration": 6.82170868, "memory(GiB)": 21.51, "elapsed_time": "11h 0m 20s", "remaining_time": "36m 28s", "loss_scale": 1.0, "consumed_samples": 1543424, "global_step/max_steps": "6029/6362"} +{"lm loss": 4.85669327, "grad_norm": 0.24952954, "learning_rate": 3.72e-06, "elapsed_time_per_iteration": 6.60043406, "memory(GiB)": 21.51, "elapsed_time": "11h 0m 27s", "remaining_time": "36m 21s", "loss_scale": 1.0, "consumed_samples": 1543680, "global_step/max_steps": "6030/6362"} +{"lm loss": 4.85600042, "grad_norm": 0.25838029, "learning_rate": 3.72e-06, "elapsed_time_per_iteration": 6.6834209, "memory(GiB)": 21.51, "elapsed_time": "11h 0m 34s", "remaining_time": "36m 15s", "loss_scale": 1.0, "consumed_samples": 1543936, "global_step/max_steps": "6031/6362"} +{"lm loss": 4.88553619, "grad_norm": 0.25656223, "learning_rate": 3.71e-06, "elapsed_time_per_iteration": 6.57822084, "memory(GiB)": 21.51, "elapsed_time": "11h 0m 40s", "remaining_time": "36m 8s", "loss_scale": 1.0, "consumed_samples": 1544192, "global_step/max_steps": "6032/6362"} +{"lm loss": 4.87570763, "grad_norm": 0.25804055, "learning_rate": 3.71e-06, "elapsed_time_per_iteration": 6.54468036, "memory(GiB)": 21.51, "elapsed_time": "11h 0m 47s", "remaining_time": "36m 2s", "loss_scale": 1.0, "consumed_samples": 1544448, "global_step/max_steps": "6033/6362"} +{"lm loss": 4.84976149, "grad_norm": 0.26211315, "learning_rate": 3.7e-06, "elapsed_time_per_iteration": 6.619385, "memory(GiB)": 21.51, "elapsed_time": "11h 0m 53s", "remaining_time": "35m 55s", "loss_scale": 1.0, "consumed_samples": 1544704, "global_step/max_steps": "6034/6362"} +{"lm loss": 4.8864665, "grad_norm": 0.24925445, "learning_rate": 3.7e-06, "elapsed_time_per_iteration": 6.49086809, "memory(GiB)": 21.51, "elapsed_time": "11h 1m 0s", "remaining_time": "35m 48s", "loss_scale": 1.0, "consumed_samples": 1544960, "global_step/max_steps": "6035/6362"} +{"lm loss": 4.8507905, "grad_norm": 0.26236683, "learning_rate": 3.69e-06, "elapsed_time_per_iteration": 6.74611712, "memory(GiB)": 21.51, "elapsed_time": "11h 1m 7s", "remaining_time": "35m 42s", "loss_scale": 1.0, "consumed_samples": 1545216, "global_step/max_steps": "6036/6362"} +{"lm loss": 4.85144663, "grad_norm": 0.2526421, "learning_rate": 3.69e-06, "elapsed_time_per_iteration": 6.660923, "memory(GiB)": 21.51, "elapsed_time": "11h 1m 13s", "remaining_time": "35m 35s", "loss_scale": 1.0, "consumed_samples": 1545472, "global_step/max_steps": "6037/6362"} +{"lm loss": 4.8782444, "grad_norm": 0.24410093, "learning_rate": 3.69e-06, "elapsed_time_per_iteration": 6.4339025, "memory(GiB)": 21.51, "elapsed_time": "11h 1m 20s", "remaining_time": "35m 29s", "loss_scale": 1.0, "consumed_samples": 1545728, "global_step/max_steps": "6038/6362"} +{"lm loss": 4.87820148, "grad_norm": 0.26384953, "learning_rate": 3.68e-06, "elapsed_time_per_iteration": 6.53538775, "memory(GiB)": 21.51, "elapsed_time": "11h 1m 26s", "remaining_time": "35m 22s", "loss_scale": 1.0, "consumed_samples": 1545984, "global_step/max_steps": "6039/6362"} +{"lm loss": 4.86026525, "grad_norm": 0.25979182, "learning_rate": 3.68e-06, "elapsed_time_per_iteration": 6.69724441, "memory(GiB)": 21.51, "elapsed_time": "11h 1m 33s", "remaining_time": "35m 16s", "loss_scale": 1.0, "consumed_samples": 1546240, "global_step/max_steps": "6040/6362"} +{"lm loss": 4.87152243, "grad_norm": 0.26190642, "learning_rate": 3.67e-06, "elapsed_time_per_iteration": 6.58817172, "memory(GiB)": 21.51, "elapsed_time": "11h 1m 40s", "remaining_time": "35m 9s", "loss_scale": 1.0, "consumed_samples": 1546496, "global_step/max_steps": "6041/6362"} +{"lm loss": 4.87114239, "grad_norm": 0.25370061, "learning_rate": 3.67e-06, "elapsed_time_per_iteration": 6.56383991, "memory(GiB)": 21.51, "elapsed_time": "11h 1m 46s", "remaining_time": "35m 2s", "loss_scale": 1.0, "consumed_samples": 1546752, "global_step/max_steps": "6042/6362"} +{"lm loss": 4.86343431, "grad_norm": 0.25120768, "learning_rate": 3.67e-06, "elapsed_time_per_iteration": 6.65309978, "memory(GiB)": 21.51, "elapsed_time": "11h 1m 53s", "remaining_time": "34m 56s", "loss_scale": 1.0, "consumed_samples": 1547008, "global_step/max_steps": "6043/6362"} +{"lm loss": 4.87127447, "grad_norm": 0.26844844, "learning_rate": 3.66e-06, "elapsed_time_per_iteration": 6.58441758, "memory(GiB)": 21.51, "elapsed_time": "11h 1m 59s", "remaining_time": "34m 49s", "loss_scale": 1.0, "consumed_samples": 1547264, "global_step/max_steps": "6044/6362"} +{"lm loss": 4.84228992, "grad_norm": 0.25519782, "learning_rate": 3.66e-06, "elapsed_time_per_iteration": 6.6943326, "memory(GiB)": 21.51, "elapsed_time": "11h 2m 6s", "remaining_time": "34m 43s", "loss_scale": 1.0, "consumed_samples": 1547520, "global_step/max_steps": "6045/6362"} +{"lm loss": 4.87499189, "grad_norm": 0.26319641, "learning_rate": 3.65e-06, "elapsed_time_per_iteration": 6.56085491, "memory(GiB)": 21.51, "elapsed_time": "11h 2m 13s", "remaining_time": "34m 36s", "loss_scale": 1.0, "consumed_samples": 1547776, "global_step/max_steps": "6046/6362"} +{"lm loss": 4.87878847, "grad_norm": 0.26173353, "learning_rate": 3.65e-06, "elapsed_time_per_iteration": 6.47142529, "memory(GiB)": 21.51, "elapsed_time": "11h 2m 19s", "remaining_time": "34m 30s", "loss_scale": 1.0, "consumed_samples": 1548032, "global_step/max_steps": "6047/6362"} +{"lm loss": 4.86374283, "grad_norm": 0.26494238, "learning_rate": 3.64e-06, "elapsed_time_per_iteration": 6.42427325, "memory(GiB)": 21.51, "elapsed_time": "11h 2m 26s", "remaining_time": "34m 23s", "loss_scale": 1.0, "consumed_samples": 1548288, "global_step/max_steps": "6048/6362"} +{"lm loss": 4.86517191, "grad_norm": 0.25519389, "learning_rate": 3.64e-06, "elapsed_time_per_iteration": 6.59598851, "memory(GiB)": 21.51, "elapsed_time": "11h 2m 32s", "remaining_time": "34m 16s", "loss_scale": 1.0, "consumed_samples": 1548544, "global_step/max_steps": "6049/6362"} +{"lm loss": 4.85415316, "grad_norm": 0.27364802, "learning_rate": 3.64e-06, "elapsed_time_per_iteration": 6.41280842, "memory(GiB)": 21.51, "elapsed_time": "11h 2m 39s", "remaining_time": "34m 10s", "loss_scale": 1.0, "consumed_samples": 1548800, "global_step/max_steps": "6050/6362"} +{"lm loss": 4.88107395, "grad_norm": 0.25890672, "learning_rate": 3.63e-06, "elapsed_time_per_iteration": 6.45599556, "memory(GiB)": 21.51, "elapsed_time": "11h 2m 45s", "remaining_time": "34m 3s", "loss_scale": 1.0, "consumed_samples": 1549056, "global_step/max_steps": "6051/6362"} +{"lm loss": 4.8848815, "grad_norm": 0.24840589, "learning_rate": 3.63e-06, "elapsed_time_per_iteration": 6.44171882, "memory(GiB)": 21.51, "elapsed_time": "11h 2m 52s", "remaining_time": "33m 57s", "loss_scale": 1.0, "consumed_samples": 1549312, "global_step/max_steps": "6052/6362"} +{"lm loss": 4.86152172, "grad_norm": 0.25016177, "learning_rate": 3.62e-06, "elapsed_time_per_iteration": 6.4427309, "memory(GiB)": 21.51, "elapsed_time": "11h 2m 58s", "remaining_time": "33m 50s", "loss_scale": 1.0, "consumed_samples": 1549568, "global_step/max_steps": "6053/6362"} +{"lm loss": 4.87041759, "grad_norm": 0.26546106, "learning_rate": 3.62e-06, "elapsed_time_per_iteration": 6.55948424, "memory(GiB)": 21.51, "elapsed_time": "11h 3m 5s", "remaining_time": "33m 44s", "loss_scale": 1.0, "consumed_samples": 1549824, "global_step/max_steps": "6054/6362"} +{"lm loss": 4.87177086, "grad_norm": 0.28122282, "learning_rate": 3.62e-06, "elapsed_time_per_iteration": 6.68986917, "memory(GiB)": 21.51, "elapsed_time": "11h 3m 11s", "remaining_time": "33m 37s", "loss_scale": 1.0, "consumed_samples": 1550080, "global_step/max_steps": "6055/6362"} +{"lm loss": 4.88375044, "grad_norm": 0.257631, "learning_rate": 3.61e-06, "elapsed_time_per_iteration": 6.54571819, "memory(GiB)": 21.51, "elapsed_time": "11h 3m 18s", "remaining_time": "33m 30s", "loss_scale": 1.0, "consumed_samples": 1550336, "global_step/max_steps": "6056/6362"} +{"lm loss": 4.85424614, "grad_norm": 0.25940445, "learning_rate": 3.61e-06, "elapsed_time_per_iteration": 6.53565693, "memory(GiB)": 21.51, "elapsed_time": "11h 3m 24s", "remaining_time": "33m 24s", "loss_scale": 1.0, "consumed_samples": 1550592, "global_step/max_steps": "6057/6362"} +{"lm loss": 4.88122559, "grad_norm": 0.25250411, "learning_rate": 3.6e-06, "elapsed_time_per_iteration": 6.85456824, "memory(GiB)": 21.51, "elapsed_time": "11h 3m 31s", "remaining_time": "33m 17s", "loss_scale": 1.0, "consumed_samples": 1550848, "global_step/max_steps": "6058/6362"} +{"lm loss": 4.87510395, "grad_norm": 0.26292247, "learning_rate": 3.6e-06, "elapsed_time_per_iteration": 6.51724291, "memory(GiB)": 21.51, "elapsed_time": "11h 3m 38s", "remaining_time": "33m 11s", "loss_scale": 1.0, "consumed_samples": 1551104, "global_step/max_steps": "6059/6362"} +{"lm loss": 4.85728788, "grad_norm": 0.25393304, "learning_rate": 3.6e-06, "elapsed_time_per_iteration": 6.6627748, "memory(GiB)": 21.51, "elapsed_time": "11h 3m 44s", "remaining_time": "33m 4s", "loss_scale": 1.0, "consumed_samples": 1551360, "global_step/max_steps": "6060/6362"} +{"lm loss": 4.8599124, "grad_norm": 0.27426323, "learning_rate": 3.59e-06, "elapsed_time_per_iteration": 6.46049118, "memory(GiB)": 21.51, "elapsed_time": "11h 3m 51s", "remaining_time": "32m 58s", "loss_scale": 1.0, "consumed_samples": 1551616, "global_step/max_steps": "6061/6362"} +{"lm loss": 4.86042786, "grad_norm": 0.26373225, "learning_rate": 3.59e-06, "elapsed_time_per_iteration": 6.60065842, "memory(GiB)": 21.51, "elapsed_time": "11h 3m 57s", "remaining_time": "32m 51s", "loss_scale": 1.0, "consumed_samples": 1551872, "global_step/max_steps": "6062/6362"} +{"lm loss": 4.85586405, "grad_norm": 0.26946342, "learning_rate": 3.58e-06, "elapsed_time_per_iteration": 6.65101266, "memory(GiB)": 21.51, "elapsed_time": "11h 4m 4s", "remaining_time": "32m 44s", "loss_scale": 1.0, "consumed_samples": 1552128, "global_step/max_steps": "6063/6362"} +{"lm loss": 4.87896633, "grad_norm": 0.24515809, "learning_rate": 3.58e-06, "elapsed_time_per_iteration": 6.66897082, "memory(GiB)": 21.51, "elapsed_time": "11h 4m 11s", "remaining_time": "32m 38s", "loss_scale": 1.0, "consumed_samples": 1552384, "global_step/max_steps": "6064/6362"} +{"lm loss": 4.87573147, "grad_norm": 0.28102434, "learning_rate": 3.58e-06, "elapsed_time_per_iteration": 6.49718928, "memory(GiB)": 21.51, "elapsed_time": "11h 4m 17s", "remaining_time": "32m 31s", "loss_scale": 1.0, "consumed_samples": 1552640, "global_step/max_steps": "6065/6362"} +{"lm loss": 4.84023714, "grad_norm": 0.25944528, "learning_rate": 3.57e-06, "elapsed_time_per_iteration": 6.58677316, "memory(GiB)": 21.51, "elapsed_time": "11h 4m 24s", "remaining_time": "32m 25s", "loss_scale": 1.0, "consumed_samples": 1552896, "global_step/max_steps": "6066/6362"} +{"lm loss": 4.87146711, "grad_norm": 0.26153657, "learning_rate": 3.57e-06, "elapsed_time_per_iteration": 6.420331, "memory(GiB)": 21.51, "elapsed_time": "11h 4m 30s", "remaining_time": "32m 18s", "loss_scale": 1.0, "consumed_samples": 1553152, "global_step/max_steps": "6067/6362"} +{"lm loss": 4.84770155, "grad_norm": 0.25637713, "learning_rate": 3.57e-06, "elapsed_time_per_iteration": 6.57850552, "memory(GiB)": 21.51, "elapsed_time": "11h 4m 37s", "remaining_time": "32m 12s", "loss_scale": 1.0, "consumed_samples": 1553408, "global_step/max_steps": "6068/6362"} +{"lm loss": 4.87091494, "grad_norm": 0.26939905, "learning_rate": 3.56e-06, "elapsed_time_per_iteration": 6.4418354, "memory(GiB)": 21.51, "elapsed_time": "11h 4m 43s", "remaining_time": "32m 5s", "loss_scale": 1.0, "consumed_samples": 1553664, "global_step/max_steps": "6069/6362"} +{"lm loss": 4.87285852, "grad_norm": 0.26061636, "learning_rate": 3.56e-06, "elapsed_time_per_iteration": 6.56644702, "memory(GiB)": 21.51, "elapsed_time": "11h 4m 50s", "remaining_time": "31m 58s", "loss_scale": 1.0, "consumed_samples": 1553920, "global_step/max_steps": "6070/6362"} +{"lm loss": 4.86838961, "grad_norm": 0.26378906, "learning_rate": 3.55e-06, "elapsed_time_per_iteration": 6.56216455, "memory(GiB)": 21.51, "elapsed_time": "11h 4m 56s", "remaining_time": "31m 52s", "loss_scale": 1.0, "consumed_samples": 1554176, "global_step/max_steps": "6071/6362"} +{"lm loss": 4.89344168, "grad_norm": 0.25712991, "learning_rate": 3.55e-06, "elapsed_time_per_iteration": 6.51716995, "memory(GiB)": 21.51, "elapsed_time": "11h 5m 3s", "remaining_time": "31m 45s", "loss_scale": 1.0, "consumed_samples": 1554432, "global_step/max_steps": "6072/6362"} +{"lm loss": 4.87555885, "grad_norm": 0.2615712, "learning_rate": 3.55e-06, "elapsed_time_per_iteration": 6.51085186, "memory(GiB)": 21.51, "elapsed_time": "11h 5m 9s", "remaining_time": "31m 39s", "loss_scale": 1.0, "consumed_samples": 1554688, "global_step/max_steps": "6073/6362"} +{"lm loss": 4.8722353, "grad_norm": 0.26372018, "learning_rate": 3.54e-06, "elapsed_time_per_iteration": 6.4983449, "memory(GiB)": 21.51, "elapsed_time": "11h 5m 16s", "remaining_time": "31m 32s", "loss_scale": 1.0, "consumed_samples": 1554944, "global_step/max_steps": "6074/6362"} +{"lm loss": 4.90133572, "grad_norm": 0.26040804, "learning_rate": 3.54e-06, "elapsed_time_per_iteration": 6.33890581, "memory(GiB)": 21.51, "elapsed_time": "11h 5m 22s", "remaining_time": "31m 26s", "loss_scale": 1.0, "consumed_samples": 1555200, "global_step/max_steps": "6075/6362"} +{"lm loss": 4.85200357, "grad_norm": 0.26870668, "learning_rate": 3.53e-06, "elapsed_time_per_iteration": 6.43809605, "memory(GiB)": 21.51, "elapsed_time": "11h 5m 29s", "remaining_time": "31m 19s", "loss_scale": 1.0, "consumed_samples": 1555456, "global_step/max_steps": "6076/6362"} +{"lm loss": 4.87768221, "grad_norm": 0.26628864, "learning_rate": 3.53e-06, "elapsed_time_per_iteration": 6.3868854, "memory(GiB)": 21.51, "elapsed_time": "11h 5m 35s", "remaining_time": "31m 12s", "loss_scale": 1.0, "consumed_samples": 1555712, "global_step/max_steps": "6077/6362"} +{"lm loss": 4.87191677, "grad_norm": 0.25598365, "learning_rate": 3.53e-06, "elapsed_time_per_iteration": 6.70420957, "memory(GiB)": 21.51, "elapsed_time": "11h 5m 42s", "remaining_time": "31m 6s", "loss_scale": 1.0, "consumed_samples": 1555968, "global_step/max_steps": "6078/6362"} +{"lm loss": 4.88090706, "grad_norm": 0.26632801, "learning_rate": 3.52e-06, "elapsed_time_per_iteration": 6.59143424, "memory(GiB)": 21.51, "elapsed_time": "11h 5m 48s", "remaining_time": "30m 59s", "loss_scale": 1.0, "consumed_samples": 1556224, "global_step/max_steps": "6079/6362"} +{"lm loss": 4.85766125, "grad_norm": 0.26878127, "learning_rate": 3.52e-06, "elapsed_time_per_iteration": 6.41239858, "memory(GiB)": 21.51, "elapsed_time": "11h 5m 55s", "remaining_time": "30m 53s", "loss_scale": 1.0, "consumed_samples": 1556480, "global_step/max_steps": "6080/6362"} +{"lm loss": 4.8609004, "grad_norm": 0.25853154, "learning_rate": 3.52e-06, "elapsed_time_per_iteration": 6.68581247, "memory(GiB)": 21.51, "elapsed_time": "11h 6m 1s", "remaining_time": "30m 46s", "loss_scale": 1.0, "consumed_samples": 1556736, "global_step/max_steps": "6081/6362"} +{"lm loss": 4.8801403, "grad_norm": 0.25624159, "learning_rate": 3.51e-06, "elapsed_time_per_iteration": 6.52374268, "memory(GiB)": 21.51, "elapsed_time": "11h 6m 8s", "remaining_time": "30m 40s", "loss_scale": 1.0, "consumed_samples": 1556992, "global_step/max_steps": "6082/6362"} +{"lm loss": 4.86112547, "grad_norm": 0.26226908, "learning_rate": 3.51e-06, "elapsed_time_per_iteration": 6.52035093, "memory(GiB)": 21.51, "elapsed_time": "11h 6m 14s", "remaining_time": "30m 33s", "loss_scale": 1.0, "consumed_samples": 1557248, "global_step/max_steps": "6083/6362"} +{"lm loss": 4.84078741, "grad_norm": 0.2568523, "learning_rate": 3.51e-06, "elapsed_time_per_iteration": 6.67880774, "memory(GiB)": 21.51, "elapsed_time": "11h 6m 21s", "remaining_time": "30m 26s", "loss_scale": 1.0, "consumed_samples": 1557504, "global_step/max_steps": "6084/6362"} +{"lm loss": 4.87005138, "grad_norm": 0.25486222, "learning_rate": 3.5e-06, "elapsed_time_per_iteration": 6.64766145, "memory(GiB)": 21.51, "elapsed_time": "11h 6m 28s", "remaining_time": "30m 20s", "loss_scale": 1.0, "consumed_samples": 1557760, "global_step/max_steps": "6085/6362"} +{"lm loss": 4.85227489, "grad_norm": 0.26620433, "learning_rate": 3.5e-06, "elapsed_time_per_iteration": 6.67268085, "memory(GiB)": 21.51, "elapsed_time": "11h 6m 34s", "remaining_time": "30m 13s", "loss_scale": 1.0, "consumed_samples": 1558016, "global_step/max_steps": "6086/6362"} +{"lm loss": 4.87745714, "grad_norm": 0.25932744, "learning_rate": 3.49e-06, "elapsed_time_per_iteration": 6.70142579, "memory(GiB)": 21.51, "elapsed_time": "11h 6m 41s", "remaining_time": "30m 7s", "loss_scale": 1.0, "consumed_samples": 1558272, "global_step/max_steps": "6087/6362"} +{"lm loss": 4.87295961, "grad_norm": 0.2635175, "learning_rate": 3.49e-06, "elapsed_time_per_iteration": 6.77359056, "memory(GiB)": 21.51, "elapsed_time": "11h 6m 48s", "remaining_time": "30m 0s", "loss_scale": 1.0, "consumed_samples": 1558528, "global_step/max_steps": "6088/6362"} +{"lm loss": 4.87249851, "grad_norm": 0.25880554, "learning_rate": 3.49e-06, "elapsed_time_per_iteration": 6.53287292, "memory(GiB)": 21.51, "elapsed_time": "11h 6m 54s", "remaining_time": "29m 54s", "loss_scale": 1.0, "consumed_samples": 1558784, "global_step/max_steps": "6089/6362"} +{"lm loss": 4.85847855, "grad_norm": 0.25587615, "learning_rate": 3.48e-06, "elapsed_time_per_iteration": 6.45444679, "memory(GiB)": 21.51, "elapsed_time": "11h 7m 1s", "remaining_time": "29m 47s", "loss_scale": 1.0, "consumed_samples": 1559040, "global_step/max_steps": "6090/6362"} +{"lm loss": 4.86980772, "grad_norm": 0.24264897, "learning_rate": 3.48e-06, "elapsed_time_per_iteration": 7.21257353, "memory(GiB)": 21.51, "elapsed_time": "11h 7m 8s", "remaining_time": "29m 40s", "loss_scale": 1.0, "consumed_samples": 1559296, "global_step/max_steps": "6091/6362"} +{"lm loss": 4.86896133, "grad_norm": 0.25757939, "learning_rate": 3.48e-06, "elapsed_time_per_iteration": 6.69456196, "memory(GiB)": 21.51, "elapsed_time": "11h 7m 15s", "remaining_time": "29m 34s", "loss_scale": 1.0, "consumed_samples": 1559552, "global_step/max_steps": "6092/6362"} +{"lm loss": 4.86378241, "grad_norm": 0.27142337, "learning_rate": 3.47e-06, "elapsed_time_per_iteration": 6.50409961, "memory(GiB)": 21.51, "elapsed_time": "11h 7m 21s", "remaining_time": "29m 27s", "loss_scale": 1.0, "consumed_samples": 1559808, "global_step/max_steps": "6093/6362"} +{"lm loss": 4.86287451, "grad_norm": 0.26212338, "learning_rate": 3.47e-06, "elapsed_time_per_iteration": 6.47085023, "memory(GiB)": 21.51, "elapsed_time": "11h 7m 28s", "remaining_time": "29m 21s", "loss_scale": 1.0, "consumed_samples": 1560064, "global_step/max_steps": "6094/6362"} +{"lm loss": 4.85452223, "grad_norm": 0.25910005, "learning_rate": 3.47e-06, "elapsed_time_per_iteration": 6.43225336, "memory(GiB)": 21.51, "elapsed_time": "11h 7m 34s", "remaining_time": "29m 14s", "loss_scale": 1.0, "consumed_samples": 1560320, "global_step/max_steps": "6095/6362"} +{"lm loss": 4.86689186, "grad_norm": 0.26067269, "learning_rate": 3.46e-06, "elapsed_time_per_iteration": 6.52287197, "memory(GiB)": 21.51, "elapsed_time": "11h 7m 41s", "remaining_time": "29m 8s", "loss_scale": 1.0, "consumed_samples": 1560576, "global_step/max_steps": "6096/6362"} +{"lm loss": 4.86578083, "grad_norm": 0.25959441, "learning_rate": 3.46e-06, "elapsed_time_per_iteration": 6.5945313, "memory(GiB)": 21.51, "elapsed_time": "11h 7m 47s", "remaining_time": "29m 1s", "loss_scale": 1.0, "consumed_samples": 1560832, "global_step/max_steps": "6097/6362"} +{"lm loss": 4.86961126, "grad_norm": 0.25509509, "learning_rate": 3.46e-06, "elapsed_time_per_iteration": 6.78389812, "memory(GiB)": 21.51, "elapsed_time": "11h 7m 54s", "remaining_time": "28m 54s", "loss_scale": 1.0, "consumed_samples": 1561088, "global_step/max_steps": "6098/6362"} +{"lm loss": 4.87666512, "grad_norm": 0.25149658, "learning_rate": 3.45e-06, "elapsed_time_per_iteration": 6.41151524, "memory(GiB)": 21.51, "elapsed_time": "11h 8m 1s", "remaining_time": "28m 48s", "loss_scale": 1.0, "consumed_samples": 1561344, "global_step/max_steps": "6099/6362"} +{"lm loss": 4.8590374, "grad_norm": 0.25938132, "learning_rate": 3.45e-06, "elapsed_time_per_iteration": 6.77022004, "memory(GiB)": 21.51, "elapsed_time": "11h 8m 7s", "remaining_time": "28m 41s", "loss_scale": 1.0, "consumed_samples": 1561600, "global_step/max_steps": "6100/6362"} +{"lm loss": 4.86486864, "grad_norm": 0.2627126, "learning_rate": 3.45e-06, "elapsed_time_per_iteration": 6.65740323, "memory(GiB)": 21.51, "elapsed_time": "11h 8m 14s", "remaining_time": "28m 35s", "loss_scale": 1.0, "consumed_samples": 1561856, "global_step/max_steps": "6101/6362"} +{"lm loss": 4.84890604, "grad_norm": 0.27943364, "learning_rate": 3.44e-06, "elapsed_time_per_iteration": 6.66136694, "memory(GiB)": 21.51, "elapsed_time": "11h 8m 21s", "remaining_time": "28m 28s", "loss_scale": 1.0, "consumed_samples": 1562112, "global_step/max_steps": "6102/6362"} +{"lm loss": 4.85479641, "grad_norm": 0.25475428, "learning_rate": 3.44e-06, "elapsed_time_per_iteration": 6.719522, "memory(GiB)": 21.51, "elapsed_time": "11h 8m 27s", "remaining_time": "28m 22s", "loss_scale": 1.0, "consumed_samples": 1562368, "global_step/max_steps": "6103/6362"} +{"lm loss": 4.87507868, "grad_norm": 0.25358209, "learning_rate": 3.44e-06, "elapsed_time_per_iteration": 6.5667057, "memory(GiB)": 21.51, "elapsed_time": "11h 8m 34s", "remaining_time": "28m 15s", "loss_scale": 1.0, "consumed_samples": 1562624, "global_step/max_steps": "6104/6362"} +{"lm loss": 4.87729073, "grad_norm": 0.25113708, "learning_rate": 3.43e-06, "elapsed_time_per_iteration": 6.52784014, "memory(GiB)": 21.51, "elapsed_time": "11h 8m 40s", "remaining_time": "28m 8s", "loss_scale": 1.0, "consumed_samples": 1562880, "global_step/max_steps": "6105/6362"} +{"lm loss": 4.85410976, "grad_norm": 0.24946336, "learning_rate": 3.43e-06, "elapsed_time_per_iteration": 6.64063597, "memory(GiB)": 21.51, "elapsed_time": "11h 8m 47s", "remaining_time": "28m 2s", "loss_scale": 1.0, "consumed_samples": 1563136, "global_step/max_steps": "6106/6362"} +{"lm loss": 4.85941315, "grad_norm": 0.2535364, "learning_rate": 3.43e-06, "elapsed_time_per_iteration": 6.5250833, "memory(GiB)": 21.51, "elapsed_time": "11h 8m 54s", "remaining_time": "27m 55s", "loss_scale": 1.0, "consumed_samples": 1563392, "global_step/max_steps": "6107/6362"} +{"lm loss": 4.8742528, "grad_norm": 0.26867959, "learning_rate": 3.42e-06, "elapsed_time_per_iteration": 6.55048704, "memory(GiB)": 21.51, "elapsed_time": "11h 9m 0s", "remaining_time": "27m 49s", "loss_scale": 1.0, "consumed_samples": 1563648, "global_step/max_steps": "6108/6362"} +{"lm loss": 4.85999775, "grad_norm": 0.27324608, "learning_rate": 3.42e-06, "elapsed_time_per_iteration": 6.46116948, "memory(GiB)": 21.51, "elapsed_time": "11h 9m 7s", "remaining_time": "27m 42s", "loss_scale": 1.0, "consumed_samples": 1563904, "global_step/max_steps": "6109/6362"} +{"lm loss": 4.86859465, "grad_norm": 0.25269511, "learning_rate": 3.42e-06, "elapsed_time_per_iteration": 6.49871612, "memory(GiB)": 21.51, "elapsed_time": "11h 9m 13s", "remaining_time": "27m 36s", "loss_scale": 1.0, "consumed_samples": 1564160, "global_step/max_steps": "6110/6362"} +{"lm loss": 4.8812623, "grad_norm": 0.25886184, "learning_rate": 3.41e-06, "elapsed_time_per_iteration": 6.49548626, "memory(GiB)": 21.51, "elapsed_time": "11h 9m 20s", "remaining_time": "27m 29s", "loss_scale": 1.0, "consumed_samples": 1564416, "global_step/max_steps": "6111/6362"} +{"lm loss": 4.83678198, "grad_norm": 0.26731279, "learning_rate": 3.41e-06, "elapsed_time_per_iteration": 6.75658822, "memory(GiB)": 21.51, "elapsed_time": "11h 9m 26s", "remaining_time": "27m 22s", "loss_scale": 1.0, "consumed_samples": 1564672, "global_step/max_steps": "6112/6362"} +{"lm loss": 4.86696529, "grad_norm": 0.27015123, "learning_rate": 3.41e-06, "elapsed_time_per_iteration": 6.5747962, "memory(GiB)": 21.51, "elapsed_time": "11h 9m 33s", "remaining_time": "27m 16s", "loss_scale": 1.0, "consumed_samples": 1564928, "global_step/max_steps": "6113/6362"} +{"lm loss": 4.87694931, "grad_norm": 0.25978652, "learning_rate": 3.4e-06, "elapsed_time_per_iteration": 6.44727421, "memory(GiB)": 21.51, "elapsed_time": "11h 9m 39s", "remaining_time": "27m 9s", "loss_scale": 1.0, "consumed_samples": 1565184, "global_step/max_steps": "6114/6362"} +{"lm loss": 4.84994125, "grad_norm": 0.25795993, "learning_rate": 3.4e-06, "elapsed_time_per_iteration": 6.66284633, "memory(GiB)": 21.51, "elapsed_time": "11h 9m 46s", "remaining_time": "27m 3s", "loss_scale": 1.0, "consumed_samples": 1565440, "global_step/max_steps": "6115/6362"} +{"lm loss": 4.8907423, "grad_norm": 0.2536132, "learning_rate": 3.4e-06, "elapsed_time_per_iteration": 6.67786884, "memory(GiB)": 21.51, "elapsed_time": "11h 9m 53s", "remaining_time": "26m 56s", "loss_scale": 1.0, "consumed_samples": 1565696, "global_step/max_steps": "6116/6362"} +{"lm loss": 4.83629084, "grad_norm": 0.27955744, "learning_rate": 3.39e-06, "elapsed_time_per_iteration": 6.73021054, "memory(GiB)": 21.51, "elapsed_time": "11h 9m 59s", "remaining_time": "26m 50s", "loss_scale": 1.0, "consumed_samples": 1565952, "global_step/max_steps": "6117/6362"} +{"lm loss": 4.84845591, "grad_norm": 0.26717591, "learning_rate": 3.39e-06, "elapsed_time_per_iteration": 6.63915014, "memory(GiB)": 21.51, "elapsed_time": "11h 10m 6s", "remaining_time": "26m 43s", "loss_scale": 1.0, "consumed_samples": 1566208, "global_step/max_steps": "6118/6362"} +{"lm loss": 4.85548496, "grad_norm": 0.25250965, "learning_rate": 3.39e-06, "elapsed_time_per_iteration": 6.70114899, "memory(GiB)": 21.51, "elapsed_time": "11h 10m 13s", "remaining_time": "26m 36s", "loss_scale": 1.0, "consumed_samples": 1566464, "global_step/max_steps": "6119/6362"} +{"lm loss": 4.87586975, "grad_norm": 0.26380038, "learning_rate": 3.38e-06, "elapsed_time_per_iteration": 6.58900905, "memory(GiB)": 21.51, "elapsed_time": "11h 10m 19s", "remaining_time": "26m 30s", "loss_scale": 1.0, "consumed_samples": 1566720, "global_step/max_steps": "6120/6362"} +{"lm loss": 4.85416365, "grad_norm": 0.27162081, "learning_rate": 3.38e-06, "elapsed_time_per_iteration": 6.69876814, "memory(GiB)": 21.51, "elapsed_time": "11h 10m 26s", "remaining_time": "26m 23s", "loss_scale": 1.0, "consumed_samples": 1566976, "global_step/max_steps": "6121/6362"} +{"lm loss": 4.84537888, "grad_norm": 0.25743258, "learning_rate": 3.38e-06, "elapsed_time_per_iteration": 6.4392159, "memory(GiB)": 21.51, "elapsed_time": "11h 10m 33s", "remaining_time": "26m 17s", "loss_scale": 1.0, "consumed_samples": 1567232, "global_step/max_steps": "6122/6362"} +{"lm loss": 4.89145613, "grad_norm": 0.27717608, "learning_rate": 3.37e-06, "elapsed_time_per_iteration": 6.47676206, "memory(GiB)": 21.51, "elapsed_time": "11h 10m 39s", "remaining_time": "26m 10s", "loss_scale": 1.0, "consumed_samples": 1567488, "global_step/max_steps": "6123/6362"} +{"lm loss": 4.86228991, "grad_norm": 0.268381, "learning_rate": 3.37e-06, "elapsed_time_per_iteration": 6.60211205, "memory(GiB)": 21.51, "elapsed_time": "11h 10m 46s", "remaining_time": "26m 4s", "loss_scale": 1.0, "consumed_samples": 1567744, "global_step/max_steps": "6124/6362"} +{"lm loss": 4.87310553, "grad_norm": 0.25865209, "learning_rate": 3.37e-06, "elapsed_time_per_iteration": 6.60497046, "memory(GiB)": 21.51, "elapsed_time": "11h 10m 52s", "remaining_time": "25m 57s", "loss_scale": 1.0, "consumed_samples": 1568000, "global_step/max_steps": "6125/6362"} +{"lm loss": 4.8617692, "grad_norm": 0.2630575, "learning_rate": 3.36e-06, "elapsed_time_per_iteration": 6.81097579, "memory(GiB)": 21.51, "elapsed_time": "11h 10m 59s", "remaining_time": "25m 50s", "loss_scale": 1.0, "consumed_samples": 1568256, "global_step/max_steps": "6126/6362"} +{"lm loss": 4.86616039, "grad_norm": 0.27390438, "learning_rate": 3.36e-06, "elapsed_time_per_iteration": 6.83264995, "memory(GiB)": 21.51, "elapsed_time": "11h 11m 6s", "remaining_time": "25m 44s", "loss_scale": 1.0, "consumed_samples": 1568512, "global_step/max_steps": "6127/6362"} +{"lm loss": 4.87661839, "grad_norm": 0.26048046, "learning_rate": 3.36e-06, "elapsed_time_per_iteration": 6.46395993, "memory(GiB)": 21.51, "elapsed_time": "11h 11m 12s", "remaining_time": "25m 37s", "loss_scale": 1.0, "consumed_samples": 1568768, "global_step/max_steps": "6128/6362"} +{"lm loss": 4.87415981, "grad_norm": 0.2562238, "learning_rate": 3.36e-06, "elapsed_time_per_iteration": 6.39041948, "memory(GiB)": 21.51, "elapsed_time": "11h 11m 19s", "remaining_time": "25m 31s", "loss_scale": 1.0, "consumed_samples": 1569024, "global_step/max_steps": "6129/6362"} +{"lm loss": 4.85077953, "grad_norm": 0.26876032, "learning_rate": 3.35e-06, "elapsed_time_per_iteration": 6.91070366, "memory(GiB)": 21.51, "elapsed_time": "11h 11m 26s", "remaining_time": "25m 24s", "loss_scale": 1.0, "consumed_samples": 1569280, "global_step/max_steps": "6130/6362"} +{"lm loss": 4.85339355, "grad_norm": 0.26549464, "learning_rate": 3.35e-06, "elapsed_time_per_iteration": 6.77662754, "memory(GiB)": 21.51, "elapsed_time": "11h 11m 32s", "remaining_time": "25m 18s", "loss_scale": 1.0, "consumed_samples": 1569536, "global_step/max_steps": "6131/6362"} +{"lm loss": 4.89095926, "grad_norm": 0.24865893, "learning_rate": 3.35e-06, "elapsed_time_per_iteration": 6.78424931, "memory(GiB)": 21.51, "elapsed_time": "11h 11m 39s", "remaining_time": "25m 11s", "loss_scale": 1.0, "consumed_samples": 1569792, "global_step/max_steps": "6132/6362"} +{"lm loss": 4.86484003, "grad_norm": 0.25824484, "learning_rate": 3.34e-06, "elapsed_time_per_iteration": 6.58938169, "memory(GiB)": 21.51, "elapsed_time": "11h 11m 46s", "remaining_time": "25m 4s", "loss_scale": 1.0, "consumed_samples": 1570048, "global_step/max_steps": "6133/6362"} +{"lm loss": 4.84542418, "grad_norm": 0.25058571, "learning_rate": 3.34e-06, "elapsed_time_per_iteration": 6.70790982, "memory(GiB)": 21.51, "elapsed_time": "11h 11m 53s", "remaining_time": "24m 58s", "loss_scale": 1.0, "consumed_samples": 1570304, "global_step/max_steps": "6134/6362"} +{"lm loss": 4.85824394, "grad_norm": 0.25606379, "learning_rate": 3.34e-06, "elapsed_time_per_iteration": 6.70456243, "memory(GiB)": 21.51, "elapsed_time": "11h 11m 59s", "remaining_time": "24m 51s", "loss_scale": 1.0, "consumed_samples": 1570560, "global_step/max_steps": "6135/6362"} +{"lm loss": 4.84891605, "grad_norm": 0.25812718, "learning_rate": 3.33e-06, "elapsed_time_per_iteration": 6.87995958, "memory(GiB)": 21.51, "elapsed_time": "11h 12m 6s", "remaining_time": "24m 45s", "loss_scale": 1.0, "consumed_samples": 1570816, "global_step/max_steps": "6136/6362"} +{"lm loss": 4.86041069, "grad_norm": 0.25621086, "learning_rate": 3.33e-06, "elapsed_time_per_iteration": 6.49530602, "memory(GiB)": 21.51, "elapsed_time": "11h 12m 13s", "remaining_time": "24m 38s", "loss_scale": 1.0, "consumed_samples": 1571072, "global_step/max_steps": "6137/6362"} +{"lm loss": 4.87944698, "grad_norm": 0.26289681, "learning_rate": 3.33e-06, "elapsed_time_per_iteration": 6.55850434, "memory(GiB)": 21.51, "elapsed_time": "11h 12m 19s", "remaining_time": "24m 32s", "loss_scale": 1.0, "consumed_samples": 1571328, "global_step/max_steps": "6138/6362"} +{"lm loss": 4.83104801, "grad_norm": 0.25116137, "learning_rate": 3.33e-06, "elapsed_time_per_iteration": 6.45252419, "memory(GiB)": 21.51, "elapsed_time": "11h 12m 26s", "remaining_time": "24m 25s", "loss_scale": 1.0, "consumed_samples": 1571584, "global_step/max_steps": "6139/6362"} +{"lm loss": 4.84520769, "grad_norm": 0.25855041, "learning_rate": 3.32e-06, "elapsed_time_per_iteration": 6.49117637, "memory(GiB)": 21.51, "elapsed_time": "11h 12m 32s", "remaining_time": "24m 19s", "loss_scale": 1.0, "consumed_samples": 1571840, "global_step/max_steps": "6140/6362"} +{"lm loss": 4.85889101, "grad_norm": 0.25215566, "learning_rate": 3.32e-06, "elapsed_time_per_iteration": 6.44565439, "memory(GiB)": 21.51, "elapsed_time": "11h 12m 39s", "remaining_time": "24m 12s", "loss_scale": 1.0, "consumed_samples": 1572096, "global_step/max_steps": "6141/6362"} +{"lm loss": 4.8469696, "grad_norm": 0.25232089, "learning_rate": 3.32e-06, "elapsed_time_per_iteration": 6.73598742, "memory(GiB)": 21.51, "elapsed_time": "11h 12m 45s", "remaining_time": "24m 5s", "loss_scale": 1.0, "consumed_samples": 1572352, "global_step/max_steps": "6142/6362"} +{"lm loss": 4.85386515, "grad_norm": 0.26209274, "learning_rate": 3.31e-06, "elapsed_time_per_iteration": 6.46243382, "memory(GiB)": 21.51, "elapsed_time": "11h 12m 52s", "remaining_time": "23m 59s", "loss_scale": 1.0, "consumed_samples": 1572608, "global_step/max_steps": "6143/6362"} +{"lm loss": 4.88549376, "grad_norm": 0.26957363, "learning_rate": 3.31e-06, "elapsed_time_per_iteration": 6.4052062, "memory(GiB)": 21.51, "elapsed_time": "11h 12m 58s", "remaining_time": "23m 52s", "loss_scale": 1.0, "consumed_samples": 1572864, "global_step/max_steps": "6144/6362"} +{"lm loss": 4.87341213, "grad_norm": 0.25399908, "learning_rate": 3.31e-06, "elapsed_time_per_iteration": 6.64259028, "memory(GiB)": 21.51, "elapsed_time": "11h 13m 5s", "remaining_time": "23m 46s", "loss_scale": 1.0, "consumed_samples": 1573120, "global_step/max_steps": "6145/6362"} +{"lm loss": 4.85869455, "grad_norm": 0.25994685, "learning_rate": 3.31e-06, "elapsed_time_per_iteration": 6.63779783, "memory(GiB)": 21.51, "elapsed_time": "11h 13m 11s", "remaining_time": "23m 39s", "loss_scale": 1.0, "consumed_samples": 1573376, "global_step/max_steps": "6146/6362"} +{"lm loss": 4.88120842, "grad_norm": 0.26252323, "learning_rate": 3.3e-06, "elapsed_time_per_iteration": 6.80099559, "memory(GiB)": 21.51, "elapsed_time": "11h 13m 18s", "remaining_time": "23m 33s", "loss_scale": 1.0, "consumed_samples": 1573632, "global_step/max_steps": "6147/6362"} +{"lm loss": 4.87288666, "grad_norm": 0.25605386, "learning_rate": 3.3e-06, "elapsed_time_per_iteration": 6.54855537, "memory(GiB)": 21.51, "elapsed_time": "11h 13m 25s", "remaining_time": "23m 26s", "loss_scale": 1.0, "consumed_samples": 1573888, "global_step/max_steps": "6148/6362"} +{"lm loss": 4.89766502, "grad_norm": 0.26447105, "learning_rate": 3.3e-06, "elapsed_time_per_iteration": 6.66871715, "memory(GiB)": 21.51, "elapsed_time": "11h 13m 31s", "remaining_time": "23m 19s", "loss_scale": 1.0, "consumed_samples": 1574144, "global_step/max_steps": "6149/6362"} +{"lm loss": 4.88468742, "grad_norm": 0.25564566, "learning_rate": 3.29e-06, "elapsed_time_per_iteration": 6.42239189, "memory(GiB)": 21.51, "elapsed_time": "11h 13m 38s", "remaining_time": "23m 13s", "loss_scale": 1.0, "consumed_samples": 1574400, "global_step/max_steps": "6150/6362"} +{"lm loss": 4.86884022, "grad_norm": 0.27686653, "learning_rate": 3.29e-06, "elapsed_time_per_iteration": 6.39042664, "memory(GiB)": 21.51, "elapsed_time": "11h 13m 44s", "remaining_time": "23m 6s", "loss_scale": 1.0, "consumed_samples": 1574656, "global_step/max_steps": "6151/6362"} +{"lm loss": 4.85947561, "grad_norm": 0.26088223, "learning_rate": 3.29e-06, "elapsed_time_per_iteration": 6.62127924, "memory(GiB)": 21.51, "elapsed_time": "11h 13m 51s", "remaining_time": "23m 0s", "loss_scale": 1.0, "consumed_samples": 1574912, "global_step/max_steps": "6152/6362"} +{"lm loss": 4.88088703, "grad_norm": 0.27459368, "learning_rate": 3.29e-06, "elapsed_time_per_iteration": 6.59311223, "memory(GiB)": 21.51, "elapsed_time": "11h 13m 57s", "remaining_time": "22m 53s", "loss_scale": 1.0, "consumed_samples": 1575168, "global_step/max_steps": "6153/6362"} +{"lm loss": 4.8596735, "grad_norm": 0.2535564, "learning_rate": 3.28e-06, "elapsed_time_per_iteration": 6.41648817, "memory(GiB)": 21.51, "elapsed_time": "11h 14m 4s", "remaining_time": "22m 46s", "loss_scale": 1.0, "consumed_samples": 1575424, "global_step/max_steps": "6154/6362"} +{"lm loss": 4.86738729, "grad_norm": 0.25496975, "learning_rate": 3.28e-06, "elapsed_time_per_iteration": 6.48516846, "memory(GiB)": 21.51, "elapsed_time": "11h 14m 10s", "remaining_time": "22m 40s", "loss_scale": 1.0, "consumed_samples": 1575680, "global_step/max_steps": "6155/6362"} +{"lm loss": 4.86559343, "grad_norm": 0.26705968, "learning_rate": 3.28e-06, "elapsed_time_per_iteration": 6.60741043, "memory(GiB)": 21.51, "elapsed_time": "11h 14m 17s", "remaining_time": "22m 33s", "loss_scale": 1.0, "consumed_samples": 1575936, "global_step/max_steps": "6156/6362"} +{"lm loss": 4.87954044, "grad_norm": 0.26279306, "learning_rate": 3.28e-06, "elapsed_time_per_iteration": 6.7234571, "memory(GiB)": 21.51, "elapsed_time": "11h 14m 24s", "remaining_time": "22m 27s", "loss_scale": 1.0, "consumed_samples": 1576192, "global_step/max_steps": "6157/6362"} +{"lm loss": 4.85293674, "grad_norm": 0.26559651, "learning_rate": 3.27e-06, "elapsed_time_per_iteration": 6.54195881, "memory(GiB)": 21.51, "elapsed_time": "11h 14m 30s", "remaining_time": "22m 20s", "loss_scale": 1.0, "consumed_samples": 1576448, "global_step/max_steps": "6158/6362"} +{"lm loss": 4.87859869, "grad_norm": 0.25579363, "learning_rate": 3.27e-06, "elapsed_time_per_iteration": 6.42122054, "memory(GiB)": 21.51, "elapsed_time": "11h 14m 37s", "remaining_time": "22m 14s", "loss_scale": 1.0, "consumed_samples": 1576704, "global_step/max_steps": "6159/6362"} +{"lm loss": 4.85667133, "grad_norm": 0.26713139, "learning_rate": 3.27e-06, "elapsed_time_per_iteration": 6.61405993, "memory(GiB)": 21.51, "elapsed_time": "11h 14m 43s", "remaining_time": "22m 7s", "loss_scale": 1.0, "consumed_samples": 1576960, "global_step/max_steps": "6160/6362"} +{"lm loss": 4.86720896, "grad_norm": 0.25553674, "learning_rate": 3.26e-06, "elapsed_time_per_iteration": 6.38404298, "memory(GiB)": 21.51, "elapsed_time": "11h 14m 50s", "remaining_time": "22m 0s", "loss_scale": 1.0, "consumed_samples": 1577216, "global_step/max_steps": "6161/6362"} +{"lm loss": 4.87396193, "grad_norm": 0.26601982, "learning_rate": 3.26e-06, "elapsed_time_per_iteration": 6.67076159, "memory(GiB)": 21.51, "elapsed_time": "11h 14m 56s", "remaining_time": "21m 54s", "loss_scale": 1.0, "consumed_samples": 1577472, "global_step/max_steps": "6162/6362"} +{"lm loss": 4.88066339, "grad_norm": 0.26717398, "learning_rate": 3.26e-06, "elapsed_time_per_iteration": 6.40488815, "memory(GiB)": 21.51, "elapsed_time": "11h 15m 3s", "remaining_time": "21m 47s", "loss_scale": 1.0, "consumed_samples": 1577728, "global_step/max_steps": "6163/6362"} +{"lm loss": 4.85193491, "grad_norm": 0.24619022, "learning_rate": 3.26e-06, "elapsed_time_per_iteration": 6.49134851, "memory(GiB)": 21.51, "elapsed_time": "11h 15m 9s", "remaining_time": "21m 41s", "loss_scale": 1.0, "consumed_samples": 1577984, "global_step/max_steps": "6164/6362"} +{"lm loss": 4.85666847, "grad_norm": 0.26157063, "learning_rate": 3.25e-06, "elapsed_time_per_iteration": 6.39220691, "memory(GiB)": 21.51, "elapsed_time": "11h 15m 16s", "remaining_time": "21m 34s", "loss_scale": 1.0, "consumed_samples": 1578240, "global_step/max_steps": "6165/6362"} +{"lm loss": 4.85105515, "grad_norm": 0.2520875, "learning_rate": 3.25e-06, "elapsed_time_per_iteration": 6.53470302, "memory(GiB)": 21.51, "elapsed_time": "11h 15m 22s", "remaining_time": "21m 28s", "loss_scale": 1.0, "consumed_samples": 1578496, "global_step/max_steps": "6166/6362"} +{"lm loss": 4.86760139, "grad_norm": 0.25710508, "learning_rate": 3.25e-06, "elapsed_time_per_iteration": 6.46881509, "memory(GiB)": 21.51, "elapsed_time": "11h 15m 29s", "remaining_time": "21m 21s", "loss_scale": 1.0, "consumed_samples": 1578752, "global_step/max_steps": "6167/6362"} +{"lm loss": 4.86970758, "grad_norm": 0.25892535, "learning_rate": 3.25e-06, "elapsed_time_per_iteration": 6.65386987, "memory(GiB)": 21.51, "elapsed_time": "11h 15m 35s", "remaining_time": "21m 14s", "loss_scale": 1.0, "consumed_samples": 1579008, "global_step/max_steps": "6168/6362"} +{"lm loss": 4.87656879, "grad_norm": 0.26115638, "learning_rate": 3.24e-06, "elapsed_time_per_iteration": 6.36281323, "memory(GiB)": 21.51, "elapsed_time": "11h 15m 42s", "remaining_time": "21m 8s", "loss_scale": 1.0, "consumed_samples": 1579264, "global_step/max_steps": "6169/6362"} +{"lm loss": 4.90024328, "grad_norm": 0.25117767, "learning_rate": 3.24e-06, "elapsed_time_per_iteration": 6.37565613, "memory(GiB)": 21.51, "elapsed_time": "11h 15m 48s", "remaining_time": "21m 1s", "loss_scale": 1.0, "consumed_samples": 1579520, "global_step/max_steps": "6170/6362"} +{"lm loss": 4.82528782, "grad_norm": 0.27067697, "learning_rate": 3.24e-06, "elapsed_time_per_iteration": 6.56452966, "memory(GiB)": 21.51, "elapsed_time": "11h 15m 55s", "remaining_time": "20m 55s", "loss_scale": 1.0, "consumed_samples": 1579776, "global_step/max_steps": "6171/6362"} +{"lm loss": 4.86735106, "grad_norm": 0.25248182, "learning_rate": 3.24e-06, "elapsed_time_per_iteration": 6.52184796, "memory(GiB)": 21.51, "elapsed_time": "11h 16m 1s", "remaining_time": "20m 48s", "loss_scale": 1.0, "consumed_samples": 1580032, "global_step/max_steps": "6172/6362"} +{"lm loss": 4.8823266, "grad_norm": 0.25569749, "learning_rate": 3.23e-06, "elapsed_time_per_iteration": 6.42083502, "memory(GiB)": 21.51, "elapsed_time": "11h 16m 8s", "remaining_time": "20m 42s", "loss_scale": 1.0, "consumed_samples": 1580288, "global_step/max_steps": "6173/6362"} +{"lm loss": 4.87074375, "grad_norm": 0.25739866, "learning_rate": 3.23e-06, "elapsed_time_per_iteration": 6.41804743, "memory(GiB)": 21.51, "elapsed_time": "11h 16m 14s", "remaining_time": "20m 35s", "loss_scale": 1.0, "consumed_samples": 1580544, "global_step/max_steps": "6174/6362"} +{"lm loss": 4.90654278, "grad_norm": 0.26435053, "learning_rate": 3.23e-06, "elapsed_time_per_iteration": 6.49576998, "memory(GiB)": 21.51, "elapsed_time": "11h 16m 20s", "remaining_time": "20m 28s", "loss_scale": 1.0, "consumed_samples": 1580800, "global_step/max_steps": "6175/6362"} +{"lm loss": 4.84209776, "grad_norm": 0.25603554, "learning_rate": 3.23e-06, "elapsed_time_per_iteration": 6.56434131, "memory(GiB)": 21.51, "elapsed_time": "11h 16m 27s", "remaining_time": "20m 22s", "loss_scale": 1.0, "consumed_samples": 1581056, "global_step/max_steps": "6176/6362"} +{"lm loss": 4.87055779, "grad_norm": 0.25156078, "learning_rate": 3.22e-06, "elapsed_time_per_iteration": 6.49440336, "memory(GiB)": 21.51, "elapsed_time": "11h 16m 33s", "remaining_time": "20m 15s", "loss_scale": 1.0, "consumed_samples": 1581312, "global_step/max_steps": "6177/6362"} +{"lm loss": 4.86714602, "grad_norm": 0.26675683, "learning_rate": 3.22e-06, "elapsed_time_per_iteration": 6.50882888, "memory(GiB)": 21.51, "elapsed_time": "11h 16m 40s", "remaining_time": "20m 9s", "loss_scale": 1.0, "consumed_samples": 1581568, "global_step/max_steps": "6178/6362"} +{"lm loss": 4.87467623, "grad_norm": 0.2555908, "learning_rate": 3.22e-06, "elapsed_time_per_iteration": 6.6270225, "memory(GiB)": 21.51, "elapsed_time": "11h 16m 47s", "remaining_time": "20m 2s", "loss_scale": 1.0, "consumed_samples": 1581824, "global_step/max_steps": "6179/6362"} +{"lm loss": 4.87692642, "grad_norm": 0.26104409, "learning_rate": 3.22e-06, "elapsed_time_per_iteration": 6.95443106, "memory(GiB)": 21.51, "elapsed_time": "11h 16m 54s", "remaining_time": "19m 56s", "loss_scale": 1.0, "consumed_samples": 1582080, "global_step/max_steps": "6180/6362"} +{"lm loss": 4.87592411, "grad_norm": 0.25067967, "learning_rate": 3.21e-06, "elapsed_time_per_iteration": 6.72746038, "memory(GiB)": 21.51, "elapsed_time": "11h 17m 0s", "remaining_time": "19m 49s", "loss_scale": 1.0, "consumed_samples": 1582336, "global_step/max_steps": "6181/6362"} +{"lm loss": 4.86423635, "grad_norm": 0.2445164, "learning_rate": 3.21e-06, "elapsed_time_per_iteration": 6.78809261, "memory(GiB)": 21.51, "elapsed_time": "11h 17m 7s", "remaining_time": "19m 42s", "loss_scale": 1.0, "consumed_samples": 1582592, "global_step/max_steps": "6182/6362"} +{"lm loss": 4.8677597, "grad_norm": 0.25515798, "learning_rate": 3.21e-06, "elapsed_time_per_iteration": 6.50631785, "memory(GiB)": 21.51, "elapsed_time": "11h 17m 14s", "remaining_time": "19m 36s", "loss_scale": 1.0, "consumed_samples": 1582848, "global_step/max_steps": "6183/6362"} +{"lm loss": 4.871171, "grad_norm": 0.25152227, "learning_rate": 3.21e-06, "elapsed_time_per_iteration": 6.59482217, "memory(GiB)": 21.51, "elapsed_time": "11h 17m 20s", "remaining_time": "19m 29s", "loss_scale": 1.0, "consumed_samples": 1583104, "global_step/max_steps": "6184/6362"} +{"lm loss": 4.87307787, "grad_norm": 0.25845799, "learning_rate": 3.21e-06, "elapsed_time_per_iteration": 6.7396946, "memory(GiB)": 21.51, "elapsed_time": "11h 17m 27s", "remaining_time": "19m 23s", "loss_scale": 1.0, "consumed_samples": 1583360, "global_step/max_steps": "6185/6362"} +{"lm loss": 4.85852909, "grad_norm": 0.25386873, "learning_rate": 3.2e-06, "elapsed_time_per_iteration": 6.49700499, "memory(GiB)": 21.51, "elapsed_time": "11h 17m 33s", "remaining_time": "19m 16s", "loss_scale": 1.0, "consumed_samples": 1583616, "global_step/max_steps": "6186/6362"} +{"lm loss": 4.89493465, "grad_norm": 0.24914807, "learning_rate": 3.2e-06, "elapsed_time_per_iteration": 6.50519776, "memory(GiB)": 21.51, "elapsed_time": "11h 17m 40s", "remaining_time": "19m 10s", "loss_scale": 1.0, "consumed_samples": 1583872, "global_step/max_steps": "6187/6362"} +{"lm loss": 4.87064934, "grad_norm": 0.26092994, "learning_rate": 3.2e-06, "elapsed_time_per_iteration": 6.31778955, "memory(GiB)": 21.51, "elapsed_time": "11h 17m 46s", "remaining_time": "19m 3s", "loss_scale": 1.0, "consumed_samples": 1584128, "global_step/max_steps": "6188/6362"} +{"lm loss": 4.89088058, "grad_norm": 0.26429954, "learning_rate": 3.2e-06, "elapsed_time_per_iteration": 6.48702002, "memory(GiB)": 21.51, "elapsed_time": "11h 17m 53s", "remaining_time": "18m 56s", "loss_scale": 1.0, "consumed_samples": 1584384, "global_step/max_steps": "6189/6362"} +{"lm loss": 4.84543562, "grad_norm": 0.25680056, "learning_rate": 3.19e-06, "elapsed_time_per_iteration": 6.69582081, "memory(GiB)": 21.51, "elapsed_time": "11h 17m 59s", "remaining_time": "18m 50s", "loss_scale": 1.0, "consumed_samples": 1584640, "global_step/max_steps": "6190/6362"} +{"lm loss": 4.86280394, "grad_norm": 0.24662948, "learning_rate": 3.19e-06, "elapsed_time_per_iteration": 6.6935184, "memory(GiB)": 21.51, "elapsed_time": "11h 18m 6s", "remaining_time": "18m 43s", "loss_scale": 1.0, "consumed_samples": 1584896, "global_step/max_steps": "6191/6362"} +{"lm loss": 4.87682343, "grad_norm": 0.25940776, "learning_rate": 3.19e-06, "elapsed_time_per_iteration": 6.68989635, "memory(GiB)": 21.51, "elapsed_time": "11h 18m 13s", "remaining_time": "18m 37s", "loss_scale": 1.0, "consumed_samples": 1585152, "global_step/max_steps": "6192/6362"} +{"lm loss": 4.85867071, "grad_norm": 0.25492424, "learning_rate": 3.19e-06, "elapsed_time_per_iteration": 6.56052637, "memory(GiB)": 21.51, "elapsed_time": "11h 18m 19s", "remaining_time": "18m 30s", "loss_scale": 1.0, "consumed_samples": 1585408, "global_step/max_steps": "6193/6362"} +{"lm loss": 4.87837172, "grad_norm": 0.25584045, "learning_rate": 3.18e-06, "elapsed_time_per_iteration": 6.58109713, "memory(GiB)": 21.51, "elapsed_time": "11h 18m 26s", "remaining_time": "18m 24s", "loss_scale": 1.0, "consumed_samples": 1585664, "global_step/max_steps": "6194/6362"} +{"lm loss": 4.87532616, "grad_norm": 0.26206681, "learning_rate": 3.18e-06, "elapsed_time_per_iteration": 6.69282556, "memory(GiB)": 21.51, "elapsed_time": "11h 18m 33s", "remaining_time": "18m 17s", "loss_scale": 1.0, "consumed_samples": 1585920, "global_step/max_steps": "6195/6362"} +{"lm loss": 4.8646965, "grad_norm": 0.25204223, "learning_rate": 3.18e-06, "elapsed_time_per_iteration": 6.63592601, "memory(GiB)": 21.51, "elapsed_time": "11h 18m 39s", "remaining_time": "18m 10s", "loss_scale": 1.0, "consumed_samples": 1586176, "global_step/max_steps": "6196/6362"} +{"lm loss": 4.85590696, "grad_norm": 0.26626056, "learning_rate": 3.18e-06, "elapsed_time_per_iteration": 6.55383563, "memory(GiB)": 21.51, "elapsed_time": "11h 18m 46s", "remaining_time": "18m 4s", "loss_scale": 1.0, "consumed_samples": 1586432, "global_step/max_steps": "6197/6362"} +{"lm loss": 4.86088991, "grad_norm": 0.2736944, "learning_rate": 3.18e-06, "elapsed_time_per_iteration": 6.58160734, "memory(GiB)": 21.51, "elapsed_time": "11h 18m 52s", "remaining_time": "17m 57s", "loss_scale": 1.0, "consumed_samples": 1586688, "global_step/max_steps": "6198/6362"} +{"lm loss": 4.85815525, "grad_norm": 0.26003516, "learning_rate": 3.17e-06, "elapsed_time_per_iteration": 6.42598271, "memory(GiB)": 21.51, "elapsed_time": "11h 18m 59s", "remaining_time": "17m 51s", "loss_scale": 1.0, "consumed_samples": 1586944, "global_step/max_steps": "6199/6362"} +{"lm loss": 4.85734606, "grad_norm": 0.26296458, "learning_rate": 3.17e-06, "elapsed_time_per_iteration": 6.4162004, "memory(GiB)": 21.51, "elapsed_time": "11h 19m 5s", "remaining_time": "17m 44s", "loss_scale": 1.0, "consumed_samples": 1587200, "global_step/max_steps": "6200/6362"} +{"lm loss": 4.87615108, "grad_norm": 0.27892479, "learning_rate": 3.17e-06, "elapsed_time_per_iteration": 6.48692417, "memory(GiB)": 21.51, "elapsed_time": "11h 19m 12s", "remaining_time": "17m 38s", "loss_scale": 1.0, "consumed_samples": 1587456, "global_step/max_steps": "6201/6362"} +{"lm loss": 4.87389326, "grad_norm": 0.26284483, "learning_rate": 3.17e-06, "elapsed_time_per_iteration": 6.73290753, "memory(GiB)": 21.51, "elapsed_time": "11h 19m 18s", "remaining_time": "17m 31s", "loss_scale": 1.0, "consumed_samples": 1587712, "global_step/max_steps": "6202/6362"} +{"lm loss": 4.88336658, "grad_norm": 0.26645797, "learning_rate": 3.17e-06, "elapsed_time_per_iteration": 6.76488352, "memory(GiB)": 21.51, "elapsed_time": "11h 19m 25s", "remaining_time": "17m 24s", "loss_scale": 1.0, "consumed_samples": 1587968, "global_step/max_steps": "6203/6362"} +{"lm loss": 4.85805511, "grad_norm": 0.27041024, "learning_rate": 3.16e-06, "elapsed_time_per_iteration": 6.61059642, "memory(GiB)": 21.51, "elapsed_time": "11h 19m 32s", "remaining_time": "17m 18s", "loss_scale": 1.0, "consumed_samples": 1588224, "global_step/max_steps": "6204/6362"} +{"lm loss": 4.85829973, "grad_norm": 0.25830483, "learning_rate": 3.16e-06, "elapsed_time_per_iteration": 6.5330801, "memory(GiB)": 21.51, "elapsed_time": "11h 19m 38s", "remaining_time": "17m 11s", "loss_scale": 1.0, "consumed_samples": 1588480, "global_step/max_steps": "6205/6362"} +{"lm loss": 4.8565712, "grad_norm": 0.25168255, "learning_rate": 3.16e-06, "elapsed_time_per_iteration": 6.46965361, "memory(GiB)": 21.51, "elapsed_time": "11h 19m 45s", "remaining_time": "17m 5s", "loss_scale": 1.0, "consumed_samples": 1588736, "global_step/max_steps": "6206/6362"} +{"lm loss": 4.85799074, "grad_norm": 0.26602724, "learning_rate": 3.16e-06, "elapsed_time_per_iteration": 6.66081476, "memory(GiB)": 21.51, "elapsed_time": "11h 19m 52s", "remaining_time": "16m 58s", "loss_scale": 1.0, "consumed_samples": 1588992, "global_step/max_steps": "6207/6362"} +{"lm loss": 4.86194706, "grad_norm": 0.25387919, "learning_rate": 3.16e-06, "elapsed_time_per_iteration": 6.62934828, "memory(GiB)": 21.51, "elapsed_time": "11h 19m 58s", "remaining_time": "16m 52s", "loss_scale": 1.0, "consumed_samples": 1589248, "global_step/max_steps": "6208/6362"} +{"lm loss": 4.86516142, "grad_norm": 0.2657795, "learning_rate": 3.15e-06, "elapsed_time_per_iteration": 6.58364534, "memory(GiB)": 21.51, "elapsed_time": "11h 20m 5s", "remaining_time": "16m 45s", "loss_scale": 1.0, "consumed_samples": 1589504, "global_step/max_steps": "6209/6362"} +{"lm loss": 4.85718203, "grad_norm": 0.27440768, "learning_rate": 3.15e-06, "elapsed_time_per_iteration": 6.69172215, "memory(GiB)": 21.51, "elapsed_time": "11h 20m 11s", "remaining_time": "16m 38s", "loss_scale": 1.0, "consumed_samples": 1589760, "global_step/max_steps": "6210/6362"} +{"lm loss": 4.86955786, "grad_norm": 0.24676938, "learning_rate": 3.15e-06, "elapsed_time_per_iteration": 6.65756989, "memory(GiB)": 21.51, "elapsed_time": "11h 20m 18s", "remaining_time": "16m 32s", "loss_scale": 1.0, "consumed_samples": 1590016, "global_step/max_steps": "6211/6362"} +{"lm loss": 4.90386343, "grad_norm": 0.25537962, "learning_rate": 3.15e-06, "elapsed_time_per_iteration": 6.81243849, "memory(GiB)": 21.51, "elapsed_time": "11h 20m 25s", "remaining_time": "16m 25s", "loss_scale": 1.0, "consumed_samples": 1590272, "global_step/max_steps": "6212/6362"} +{"lm loss": 4.86209059, "grad_norm": 0.2460829, "learning_rate": 3.15e-06, "elapsed_time_per_iteration": 6.64020038, "memory(GiB)": 21.51, "elapsed_time": "11h 20m 32s", "remaining_time": "16m 19s", "loss_scale": 1.0, "consumed_samples": 1590528, "global_step/max_steps": "6213/6362"} +{"lm loss": 4.90216875, "grad_norm": 0.26932493, "learning_rate": 3.14e-06, "elapsed_time_per_iteration": 6.63993669, "memory(GiB)": 21.51, "elapsed_time": "11h 20m 38s", "remaining_time": "16m 12s", "loss_scale": 1.0, "consumed_samples": 1590784, "global_step/max_steps": "6214/6362"} +{"lm loss": 4.86529636, "grad_norm": 0.24854903, "learning_rate": 3.14e-06, "elapsed_time_per_iteration": 6.48725867, "memory(GiB)": 21.51, "elapsed_time": "11h 20m 45s", "remaining_time": "16m 6s", "loss_scale": 1.0, "consumed_samples": 1591040, "global_step/max_steps": "6215/6362"} +{"lm loss": 4.87322044, "grad_norm": 0.26518619, "learning_rate": 3.14e-06, "elapsed_time_per_iteration": 6.65975022, "memory(GiB)": 21.51, "elapsed_time": "11h 20m 51s", "remaining_time": "15m 59s", "loss_scale": 1.0, "consumed_samples": 1591296, "global_step/max_steps": "6216/6362"} +{"lm loss": 4.85131121, "grad_norm": 0.24518923, "learning_rate": 3.14e-06, "elapsed_time_per_iteration": 6.77420282, "memory(GiB)": 21.51, "elapsed_time": "11h 20m 58s", "remaining_time": "15m 52s", "loss_scale": 1.0, "consumed_samples": 1591552, "global_step/max_steps": "6217/6362"} +{"lm loss": 4.87478733, "grad_norm": 0.2506783, "learning_rate": 3.14e-06, "elapsed_time_per_iteration": 6.58902907, "memory(GiB)": 21.51, "elapsed_time": "11h 21m 5s", "remaining_time": "15m 46s", "loss_scale": 1.0, "consumed_samples": 1591808, "global_step/max_steps": "6218/6362"} +{"lm loss": 4.84642172, "grad_norm": 0.24648172, "learning_rate": 3.13e-06, "elapsed_time_per_iteration": 6.54713392, "memory(GiB)": 21.51, "elapsed_time": "11h 21m 11s", "remaining_time": "15m 39s", "loss_scale": 1.0, "consumed_samples": 1592064, "global_step/max_steps": "6219/6362"} +{"lm loss": 4.8537302, "grad_norm": 0.25104335, "learning_rate": 3.13e-06, "elapsed_time_per_iteration": 6.51570415, "memory(GiB)": 21.51, "elapsed_time": "11h 21m 18s", "remaining_time": "15m 33s", "loss_scale": 1.0, "consumed_samples": 1592320, "global_step/max_steps": "6220/6362"} +{"lm loss": 4.86639738, "grad_norm": 0.28142285, "learning_rate": 3.13e-06, "elapsed_time_per_iteration": 6.44350433, "memory(GiB)": 21.51, "elapsed_time": "11h 21m 24s", "remaining_time": "15m 26s", "loss_scale": 1.0, "consumed_samples": 1592576, "global_step/max_steps": "6221/6362"} +{"lm loss": 4.89321375, "grad_norm": 0.24522045, "learning_rate": 3.13e-06, "elapsed_time_per_iteration": 6.73861647, "memory(GiB)": 21.51, "elapsed_time": "11h 21m 31s", "remaining_time": "15m 20s", "loss_scale": 1.0, "consumed_samples": 1592832, "global_step/max_steps": "6222/6362"} +{"lm loss": 4.87472153, "grad_norm": 0.25011292, "learning_rate": 3.13e-06, "elapsed_time_per_iteration": 7.62417483, "memory(GiB)": 21.51, "elapsed_time": "11h 21m 39s", "remaining_time": "15m 13s", "loss_scale": 1.0, "consumed_samples": 1593088, "global_step/max_steps": "6223/6362"} +{"lm loss": 4.87156725, "grad_norm": 0.24684697, "learning_rate": 3.12e-06, "elapsed_time_per_iteration": 6.43691206, "memory(GiB)": 21.51, "elapsed_time": "11h 21m 45s", "remaining_time": "15m 6s", "loss_scale": 1.0, "consumed_samples": 1593344, "global_step/max_steps": "6224/6362"} +{"lm loss": 4.88194895, "grad_norm": 0.2474477, "learning_rate": 3.12e-06, "elapsed_time_per_iteration": 6.54262376, "memory(GiB)": 21.51, "elapsed_time": "11h 21m 52s", "remaining_time": "15m 0s", "loss_scale": 1.0, "consumed_samples": 1593600, "global_step/max_steps": "6225/6362"} +{"lm loss": 4.87163973, "grad_norm": 0.26198655, "learning_rate": 3.12e-06, "elapsed_time_per_iteration": 6.43036318, "memory(GiB)": 21.51, "elapsed_time": "11h 21m 58s", "remaining_time": "14m 53s", "loss_scale": 1.0, "consumed_samples": 1593856, "global_step/max_steps": "6226/6362"} +{"lm loss": 4.88395786, "grad_norm": 0.26543137, "learning_rate": 3.12e-06, "elapsed_time_per_iteration": 6.59015942, "memory(GiB)": 21.51, "elapsed_time": "11h 22m 5s", "remaining_time": "14m 47s", "loss_scale": 1.0, "consumed_samples": 1594112, "global_step/max_steps": "6227/6362"} +{"lm loss": 4.84037685, "grad_norm": 0.2532129, "learning_rate": 3.12e-06, "elapsed_time_per_iteration": 6.44476032, "memory(GiB)": 21.51, "elapsed_time": "11h 22m 11s", "remaining_time": "14m 40s", "loss_scale": 1.0, "consumed_samples": 1594368, "global_step/max_steps": "6228/6362"} +{"lm loss": 4.88383245, "grad_norm": 0.24891466, "learning_rate": 3.12e-06, "elapsed_time_per_iteration": 6.41261864, "memory(GiB)": 21.51, "elapsed_time": "11h 22m 17s", "remaining_time": "14m 34s", "loss_scale": 1.0, "consumed_samples": 1594624, "global_step/max_steps": "6229/6362"} +{"lm loss": 4.860219, "grad_norm": 0.26480213, "learning_rate": 3.11e-06, "elapsed_time_per_iteration": 6.55170608, "memory(GiB)": 21.51, "elapsed_time": "11h 22m 24s", "remaining_time": "14m 27s", "loss_scale": 1.0, "consumed_samples": 1594880, "global_step/max_steps": "6230/6362"} +{"lm loss": 4.85963488, "grad_norm": 0.27036703, "learning_rate": 3.11e-06, "elapsed_time_per_iteration": 6.54616213, "memory(GiB)": 21.51, "elapsed_time": "11h 22m 31s", "remaining_time": "14m 20s", "loss_scale": 1.0, "consumed_samples": 1595136, "global_step/max_steps": "6231/6362"} +{"lm loss": 4.87480783, "grad_norm": 0.26671275, "learning_rate": 3.11e-06, "elapsed_time_per_iteration": 6.65519285, "memory(GiB)": 21.51, "elapsed_time": "11h 22m 37s", "remaining_time": "14m 14s", "loss_scale": 1.0, "consumed_samples": 1595392, "global_step/max_steps": "6232/6362"} +{"lm loss": 4.87697649, "grad_norm": 0.26274067, "learning_rate": 3.11e-06, "elapsed_time_per_iteration": 6.5774796, "memory(GiB)": 21.51, "elapsed_time": "11h 22m 44s", "remaining_time": "14m 7s", "loss_scale": 1.0, "consumed_samples": 1595648, "global_step/max_steps": "6233/6362"} +{"lm loss": 4.87098598, "grad_norm": 0.25755849, "learning_rate": 3.11e-06, "elapsed_time_per_iteration": 6.57122755, "memory(GiB)": 21.51, "elapsed_time": "11h 22m 50s", "remaining_time": "14m 1s", "loss_scale": 1.0, "consumed_samples": 1595904, "global_step/max_steps": "6234/6362"} +{"lm loss": 4.85470247, "grad_norm": 0.27053627, "learning_rate": 3.11e-06, "elapsed_time_per_iteration": 6.54285169, "memory(GiB)": 21.51, "elapsed_time": "11h 22m 57s", "remaining_time": "13m 54s", "loss_scale": 1.0, "consumed_samples": 1596160, "global_step/max_steps": "6235/6362"} +{"lm loss": 4.85868931, "grad_norm": 0.24744256, "learning_rate": 3.1e-06, "elapsed_time_per_iteration": 6.60852766, "memory(GiB)": 21.51, "elapsed_time": "11h 23m 3s", "remaining_time": "13m 48s", "loss_scale": 1.0, "consumed_samples": 1596416, "global_step/max_steps": "6236/6362"} +{"lm loss": 4.88491726, "grad_norm": 0.26634368, "learning_rate": 3.1e-06, "elapsed_time_per_iteration": 6.64366317, "memory(GiB)": 21.51, "elapsed_time": "11h 23m 10s", "remaining_time": "13m 41s", "loss_scale": 1.0, "consumed_samples": 1596672, "global_step/max_steps": "6237/6362"} +{"lm loss": 4.87105227, "grad_norm": 0.25128549, "learning_rate": 3.1e-06, "elapsed_time_per_iteration": 6.38123918, "memory(GiB)": 21.51, "elapsed_time": "11h 23m 17s", "remaining_time": "13m 34s", "loss_scale": 1.0, "consumed_samples": 1596928, "global_step/max_steps": "6238/6362"} +{"lm loss": 4.87818432, "grad_norm": 0.2633675, "learning_rate": 3.1e-06, "elapsed_time_per_iteration": 6.52321935, "memory(GiB)": 21.51, "elapsed_time": "11h 23m 23s", "remaining_time": "13m 28s", "loss_scale": 1.0, "consumed_samples": 1597184, "global_step/max_steps": "6239/6362"} +{"lm loss": 4.86134052, "grad_norm": 0.26619118, "learning_rate": 3.1e-06, "elapsed_time_per_iteration": 6.46045566, "memory(GiB)": 21.51, "elapsed_time": "11h 23m 29s", "remaining_time": "13m 21s", "loss_scale": 1.0, "consumed_samples": 1597440, "global_step/max_steps": "6240/6362"} +{"lm loss": 4.85059404, "grad_norm": 0.25013494, "learning_rate": 3.1e-06, "elapsed_time_per_iteration": 6.78384471, "memory(GiB)": 21.51, "elapsed_time": "11h 23m 36s", "remaining_time": "13m 15s", "loss_scale": 1.0, "consumed_samples": 1597696, "global_step/max_steps": "6241/6362"} +{"lm loss": 4.87898016, "grad_norm": 0.26502258, "learning_rate": 3.09e-06, "elapsed_time_per_iteration": 6.6355834, "memory(GiB)": 21.51, "elapsed_time": "11h 23m 43s", "remaining_time": "13m 8s", "loss_scale": 1.0, "consumed_samples": 1597952, "global_step/max_steps": "6242/6362"} +{"lm loss": 4.8647356, "grad_norm": 0.27387607, "learning_rate": 3.09e-06, "elapsed_time_per_iteration": 6.4731791, "memory(GiB)": 21.51, "elapsed_time": "11h 23m 49s", "remaining_time": "13m 2s", "loss_scale": 1.0, "consumed_samples": 1598208, "global_step/max_steps": "6243/6362"} +{"lm loss": 4.85429764, "grad_norm": 0.26332554, "learning_rate": 3.09e-06, "elapsed_time_per_iteration": 6.47001743, "memory(GiB)": 21.51, "elapsed_time": "11h 23m 56s", "remaining_time": "12m 55s", "loss_scale": 1.0, "consumed_samples": 1598464, "global_step/max_steps": "6244/6362"} +{"lm loss": 4.88374805, "grad_norm": 0.26677445, "learning_rate": 3.09e-06, "elapsed_time_per_iteration": 6.48609829, "memory(GiB)": 21.51, "elapsed_time": "11h 24m 2s", "remaining_time": "12m 48s", "loss_scale": 1.0, "consumed_samples": 1598720, "global_step/max_steps": "6245/6362"} +{"lm loss": 4.88539934, "grad_norm": 0.26016772, "learning_rate": 3.09e-06, "elapsed_time_per_iteration": 6.59004378, "memory(GiB)": 21.51, "elapsed_time": "11h 24m 9s", "remaining_time": "12m 42s", "loss_scale": 1.0, "consumed_samples": 1598976, "global_step/max_steps": "6246/6362"} +{"lm loss": 4.88237953, "grad_norm": 0.25252575, "learning_rate": 3.09e-06, "elapsed_time_per_iteration": 6.41121006, "memory(GiB)": 21.51, "elapsed_time": "11h 24m 15s", "remaining_time": "12m 35s", "loss_scale": 1.0, "consumed_samples": 1599232, "global_step/max_steps": "6247/6362"} +{"lm loss": 4.89204502, "grad_norm": 0.25059181, "learning_rate": 3.09e-06, "elapsed_time_per_iteration": 6.58104539, "memory(GiB)": 21.51, "elapsed_time": "11h 24m 22s", "remaining_time": "12m 29s", "loss_scale": 1.0, "consumed_samples": 1599488, "global_step/max_steps": "6248/6362"} +{"lm loss": 4.87351036, "grad_norm": 0.26213118, "learning_rate": 3.08e-06, "elapsed_time_per_iteration": 6.549896, "memory(GiB)": 21.51, "elapsed_time": "11h 24m 28s", "remaining_time": "12m 22s", "loss_scale": 1.0, "consumed_samples": 1599744, "global_step/max_steps": "6249/6362"} +{"lm loss": 4.86139631, "grad_norm": 0.26473236, "learning_rate": 3.08e-06, "elapsed_time_per_iteration": 6.54975605, "memory(GiB)": 21.51, "elapsed_time": "11h 24m 35s", "remaining_time": "12m 16s", "loss_scale": 1.0, "consumed_samples": 1600000, "global_step/max_steps": "6250/6362"} +{"lm loss": 4.85267973, "grad_norm": 0.26011321, "learning_rate": 3.08e-06, "elapsed_time_per_iteration": 6.51097155, "memory(GiB)": 21.51, "elapsed_time": "11h 24m 42s", "remaining_time": "12m 9s", "loss_scale": 1.0, "consumed_samples": 1600256, "global_step/max_steps": "6251/6362"} +{"lm loss": 4.87575293, "grad_norm": 0.25352302, "learning_rate": 3.08e-06, "elapsed_time_per_iteration": 6.58853316, "memory(GiB)": 21.51, "elapsed_time": "11h 24m 48s", "remaining_time": "12m 2s", "loss_scale": 1.0, "consumed_samples": 1600512, "global_step/max_steps": "6252/6362"} +{"lm loss": 4.85733747, "grad_norm": 0.24908592, "learning_rate": 3.08e-06, "elapsed_time_per_iteration": 6.58081603, "memory(GiB)": 21.51, "elapsed_time": "11h 24m 55s", "remaining_time": "11m 56s", "loss_scale": 1.0, "consumed_samples": 1600768, "global_step/max_steps": "6253/6362"} +{"lm loss": 4.88939667, "grad_norm": 0.25708729, "learning_rate": 3.08e-06, "elapsed_time_per_iteration": 6.61076117, "memory(GiB)": 21.51, "elapsed_time": "11h 25m 1s", "remaining_time": "11m 49s", "loss_scale": 1.0, "consumed_samples": 1601024, "global_step/max_steps": "6254/6362"} +{"lm loss": 4.86065054, "grad_norm": 0.27038446, "learning_rate": 3.07e-06, "elapsed_time_per_iteration": 6.47058105, "memory(GiB)": 21.51, "elapsed_time": "11h 25m 8s", "remaining_time": "11m 43s", "loss_scale": 1.0, "consumed_samples": 1601280, "global_step/max_steps": "6255/6362"} +{"lm loss": 4.87360287, "grad_norm": 0.26049522, "learning_rate": 3.07e-06, "elapsed_time_per_iteration": 6.87385392, "memory(GiB)": 21.51, "elapsed_time": "11h 25m 15s", "remaining_time": "11m 36s", "loss_scale": 1.0, "consumed_samples": 1601536, "global_step/max_steps": "6256/6362"} +{"lm loss": 4.85635185, "grad_norm": 0.25532037, "learning_rate": 3.07e-06, "elapsed_time_per_iteration": 6.57714009, "memory(GiB)": 21.51, "elapsed_time": "11h 25m 21s", "remaining_time": "11m 30s", "loss_scale": 1.0, "consumed_samples": 1601792, "global_step/max_steps": "6257/6362"} +{"lm loss": 4.87986898, "grad_norm": 0.27216318, "learning_rate": 3.07e-06, "elapsed_time_per_iteration": 6.65788531, "memory(GiB)": 21.51, "elapsed_time": "11h 25m 28s", "remaining_time": "11m 23s", "loss_scale": 1.0, "consumed_samples": 1602048, "global_step/max_steps": "6258/6362"} +{"lm loss": 4.85800314, "grad_norm": 0.25126526, "learning_rate": 3.07e-06, "elapsed_time_per_iteration": 6.61064124, "memory(GiB)": 21.51, "elapsed_time": "11h 25m 34s", "remaining_time": "11m 16s", "loss_scale": 1.0, "consumed_samples": 1602304, "global_step/max_steps": "6259/6362"} +{"lm loss": 4.86485338, "grad_norm": 0.26707074, "learning_rate": 3.07e-06, "elapsed_time_per_iteration": 6.63921642, "memory(GiB)": 21.51, "elapsed_time": "11h 25m 41s", "remaining_time": "11m 10s", "loss_scale": 1.0, "consumed_samples": 1602560, "global_step/max_steps": "6260/6362"} +{"lm loss": 4.84256887, "grad_norm": 0.26281023, "learning_rate": 3.07e-06, "elapsed_time_per_iteration": 6.61241317, "memory(GiB)": 21.51, "elapsed_time": "11h 25m 48s", "remaining_time": "11m 3s", "loss_scale": 1.0, "consumed_samples": 1602816, "global_step/max_steps": "6261/6362"} +{"lm loss": 4.86436415, "grad_norm": 0.24936566, "learning_rate": 3.07e-06, "elapsed_time_per_iteration": 6.50929976, "memory(GiB)": 21.51, "elapsed_time": "11h 25m 54s", "remaining_time": "10m 57s", "loss_scale": 1.0, "consumed_samples": 1603072, "global_step/max_steps": "6262/6362"} +{"lm loss": 4.85978985, "grad_norm": 0.25144348, "learning_rate": 3.06e-06, "elapsed_time_per_iteration": 6.35294151, "memory(GiB)": 21.51, "elapsed_time": "11h 26m 1s", "remaining_time": "10m 50s", "loss_scale": 1.0, "consumed_samples": 1603328, "global_step/max_steps": "6263/6362"} +{"lm loss": 4.84165573, "grad_norm": 0.25484586, "learning_rate": 3.06e-06, "elapsed_time_per_iteration": 6.71356463, "memory(GiB)": 21.51, "elapsed_time": "11h 26m 7s", "remaining_time": "10m 44s", "loss_scale": 1.0, "consumed_samples": 1603584, "global_step/max_steps": "6264/6362"} +{"lm loss": 4.89221954, "grad_norm": 0.25095645, "learning_rate": 3.06e-06, "elapsed_time_per_iteration": 6.8283155, "memory(GiB)": 21.51, "elapsed_time": "11h 26m 14s", "remaining_time": "10m 37s", "loss_scale": 1.0, "consumed_samples": 1603840, "global_step/max_steps": "6265/6362"} +{"lm loss": 4.8869319, "grad_norm": 0.260829, "learning_rate": 3.06e-06, "elapsed_time_per_iteration": 6.35667443, "memory(GiB)": 21.51, "elapsed_time": "11h 26m 21s", "remaining_time": "10m 30s", "loss_scale": 1.0, "consumed_samples": 1604096, "global_step/max_steps": "6266/6362"} +{"lm loss": 4.85755587, "grad_norm": 0.25646275, "learning_rate": 3.06e-06, "elapsed_time_per_iteration": 6.49773693, "memory(GiB)": 21.51, "elapsed_time": "11h 26m 27s", "remaining_time": "10m 24s", "loss_scale": 1.0, "consumed_samples": 1604352, "global_step/max_steps": "6267/6362"} +{"lm loss": 4.86754465, "grad_norm": 0.25630033, "learning_rate": 3.06e-06, "elapsed_time_per_iteration": 6.50944161, "memory(GiB)": 21.51, "elapsed_time": "11h 26m 34s", "remaining_time": "10m 17s", "loss_scale": 1.0, "consumed_samples": 1604608, "global_step/max_steps": "6268/6362"} +{"lm loss": 4.87451172, "grad_norm": 0.25986743, "learning_rate": 3.06e-06, "elapsed_time_per_iteration": 6.61715126, "memory(GiB)": 21.51, "elapsed_time": "11h 26m 40s", "remaining_time": "10m 11s", "loss_scale": 1.0, "consumed_samples": 1604864, "global_step/max_steps": "6269/6362"} +{"lm loss": 4.88419151, "grad_norm": 0.25203744, "learning_rate": 3.06e-06, "elapsed_time_per_iteration": 6.42123556, "memory(GiB)": 21.51, "elapsed_time": "11h 26m 47s", "remaining_time": "10m 4s", "loss_scale": 1.0, "consumed_samples": 1605120, "global_step/max_steps": "6270/6362"} +{"lm loss": 4.85130024, "grad_norm": 0.26580378, "learning_rate": 3.05e-06, "elapsed_time_per_iteration": 6.85777783, "memory(GiB)": 21.51, "elapsed_time": "11h 26m 53s", "remaining_time": "9m 58s", "loss_scale": 1.0, "consumed_samples": 1605376, "global_step/max_steps": "6271/6362"} +{"lm loss": 4.86423111, "grad_norm": 0.26213977, "learning_rate": 3.05e-06, "elapsed_time_per_iteration": 6.50670075, "memory(GiB)": 21.51, "elapsed_time": "11h 27m 0s", "remaining_time": "9m 51s", "loss_scale": 1.0, "consumed_samples": 1605632, "global_step/max_steps": "6272/6362"} +{"lm loss": 4.84044456, "grad_norm": 0.24782734, "learning_rate": 3.05e-06, "elapsed_time_per_iteration": 6.35149097, "memory(GiB)": 21.51, "elapsed_time": "11h 27m 6s", "remaining_time": "9m 44s", "loss_scale": 1.0, "consumed_samples": 1605888, "global_step/max_steps": "6273/6362"} +{"lm loss": 4.85797453, "grad_norm": 0.25404939, "learning_rate": 3.05e-06, "elapsed_time_per_iteration": 6.68862891, "memory(GiB)": 21.51, "elapsed_time": "11h 27m 13s", "remaining_time": "9m 38s", "loss_scale": 1.0, "consumed_samples": 1606144, "global_step/max_steps": "6274/6362"} +{"lm loss": 4.86752892, "grad_norm": 0.25727183, "learning_rate": 3.05e-06, "elapsed_time_per_iteration": 6.53735185, "memory(GiB)": 21.51, "elapsed_time": "11h 27m 19s", "remaining_time": "9m 31s", "loss_scale": 1.0, "consumed_samples": 1606400, "global_step/max_steps": "6275/6362"} +{"lm loss": 4.85188198, "grad_norm": 0.24001519, "learning_rate": 3.05e-06, "elapsed_time_per_iteration": 6.72695112, "memory(GiB)": 21.51, "elapsed_time": "11h 27m 26s", "remaining_time": "9m 25s", "loss_scale": 1.0, "consumed_samples": 1606656, "global_step/max_steps": "6276/6362"} +{"lm loss": 4.87144518, "grad_norm": 0.24545121, "learning_rate": 3.05e-06, "elapsed_time_per_iteration": 6.47563839, "memory(GiB)": 21.51, "elapsed_time": "11h 27m 33s", "remaining_time": "9m 18s", "loss_scale": 1.0, "consumed_samples": 1606912, "global_step/max_steps": "6277/6362"} +{"lm loss": 4.85656118, "grad_norm": 0.2538833, "learning_rate": 3.05e-06, "elapsed_time_per_iteration": 6.38195729, "memory(GiB)": 21.51, "elapsed_time": "11h 27m 39s", "remaining_time": "9m 12s", "loss_scale": 1.0, "consumed_samples": 1607168, "global_step/max_steps": "6278/6362"} +{"lm loss": 4.87153196, "grad_norm": 0.24801145, "learning_rate": 3.05e-06, "elapsed_time_per_iteration": 6.59177923, "memory(GiB)": 21.51, "elapsed_time": "11h 27m 46s", "remaining_time": "9m 5s", "loss_scale": 1.0, "consumed_samples": 1607424, "global_step/max_steps": "6279/6362"} +{"lm loss": 4.88776684, "grad_norm": 0.25378016, "learning_rate": 3.04e-06, "elapsed_time_per_iteration": 6.59267139, "memory(GiB)": 21.51, "elapsed_time": "11h 27m 52s", "remaining_time": "8m 58s", "loss_scale": 1.0, "consumed_samples": 1607680, "global_step/max_steps": "6280/6362"} +{"lm loss": 4.83776855, "grad_norm": 0.24885619, "learning_rate": 3.04e-06, "elapsed_time_per_iteration": 6.44076419, "memory(GiB)": 21.51, "elapsed_time": "11h 27m 59s", "remaining_time": "8m 52s", "loss_scale": 1.0, "consumed_samples": 1607936, "global_step/max_steps": "6281/6362"} +{"lm loss": 4.85765553, "grad_norm": 0.25069258, "learning_rate": 3.04e-06, "elapsed_time_per_iteration": 6.59513569, "memory(GiB)": 21.51, "elapsed_time": "11h 28m 5s", "remaining_time": "8m 45s", "loss_scale": 1.0, "consumed_samples": 1608192, "global_step/max_steps": "6282/6362"} +{"lm loss": 4.88468885, "grad_norm": 0.25758818, "learning_rate": 3.04e-06, "elapsed_time_per_iteration": 6.61628532, "memory(GiB)": 21.51, "elapsed_time": "11h 28m 12s", "remaining_time": "8m 39s", "loss_scale": 1.0, "consumed_samples": 1608448, "global_step/max_steps": "6283/6362"} +{"lm loss": 4.86548281, "grad_norm": 0.24980219, "learning_rate": 3.04e-06, "elapsed_time_per_iteration": 6.65180492, "memory(GiB)": 21.51, "elapsed_time": "11h 28m 19s", "remaining_time": "8m 32s", "loss_scale": 1.0, "consumed_samples": 1608704, "global_step/max_steps": "6284/6362"} +{"lm loss": 4.87234497, "grad_norm": 0.25163558, "learning_rate": 3.04e-06, "elapsed_time_per_iteration": 6.8297801, "memory(GiB)": 21.51, "elapsed_time": "11h 28m 25s", "remaining_time": "8m 26s", "loss_scale": 1.0, "consumed_samples": 1608960, "global_step/max_steps": "6285/6362"} +{"lm loss": 4.85900497, "grad_norm": 0.25749326, "learning_rate": 3.04e-06, "elapsed_time_per_iteration": 6.62067199, "memory(GiB)": 21.51, "elapsed_time": "11h 28m 32s", "remaining_time": "8m 19s", "loss_scale": 1.0, "consumed_samples": 1609216, "global_step/max_steps": "6286/6362"} +{"lm loss": 4.85642481, "grad_norm": 0.25179696, "learning_rate": 3.04e-06, "elapsed_time_per_iteration": 6.46805406, "memory(GiB)": 21.51, "elapsed_time": "11h 28m 38s", "remaining_time": "8m 12s", "loss_scale": 1.0, "consumed_samples": 1609472, "global_step/max_steps": "6287/6362"} +{"lm loss": 4.87830687, "grad_norm": 0.25235954, "learning_rate": 3.04e-06, "elapsed_time_per_iteration": 6.62440228, "memory(GiB)": 21.51, "elapsed_time": "11h 28m 45s", "remaining_time": "8m 6s", "loss_scale": 1.0, "consumed_samples": 1609728, "global_step/max_steps": "6288/6362"} +{"lm loss": 4.86702204, "grad_norm": 0.25488484, "learning_rate": 3.03e-06, "elapsed_time_per_iteration": 6.70596576, "memory(GiB)": 21.51, "elapsed_time": "11h 28m 52s", "remaining_time": "7m 59s", "loss_scale": 1.0, "consumed_samples": 1609984, "global_step/max_steps": "6289/6362"} +{"lm loss": 4.87878704, "grad_norm": 0.25293979, "learning_rate": 3.03e-06, "elapsed_time_per_iteration": 6.72833848, "memory(GiB)": 21.51, "elapsed_time": "11h 28m 59s", "remaining_time": "7m 53s", "loss_scale": 1.0, "consumed_samples": 1610240, "global_step/max_steps": "6290/6362"} +{"lm loss": 4.8504405, "grad_norm": 0.25004447, "learning_rate": 3.03e-06, "elapsed_time_per_iteration": 6.61162233, "memory(GiB)": 21.51, "elapsed_time": "11h 29m 5s", "remaining_time": "7m 46s", "loss_scale": 1.0, "consumed_samples": 1610496, "global_step/max_steps": "6291/6362"} +{"lm loss": 4.86271238, "grad_norm": 0.26288271, "learning_rate": 3.03e-06, "elapsed_time_per_iteration": 6.53048277, "memory(GiB)": 21.51, "elapsed_time": "11h 29m 12s", "remaining_time": "7m 40s", "loss_scale": 1.0, "consumed_samples": 1610752, "global_step/max_steps": "6292/6362"} +{"lm loss": 4.8305378, "grad_norm": 0.25907224, "learning_rate": 3.03e-06, "elapsed_time_per_iteration": 6.50816751, "memory(GiB)": 21.51, "elapsed_time": "11h 29m 18s", "remaining_time": "7m 33s", "loss_scale": 1.0, "consumed_samples": 1611008, "global_step/max_steps": "6293/6362"} +{"lm loss": 4.86149073, "grad_norm": 0.24895146, "learning_rate": 3.03e-06, "elapsed_time_per_iteration": 6.75824356, "memory(GiB)": 21.51, "elapsed_time": "11h 29m 25s", "remaining_time": "7m 26s", "loss_scale": 1.0, "consumed_samples": 1611264, "global_step/max_steps": "6294/6362"} +{"lm loss": 4.8938241, "grad_norm": 0.25454283, "learning_rate": 3.03e-06, "elapsed_time_per_iteration": 6.69097948, "memory(GiB)": 21.51, "elapsed_time": "11h 29m 32s", "remaining_time": "7m 20s", "loss_scale": 1.0, "consumed_samples": 1611520, "global_step/max_steps": "6295/6362"} +{"lm loss": 4.86952591, "grad_norm": 0.25447834, "learning_rate": 3.03e-06, "elapsed_time_per_iteration": 6.50950718, "memory(GiB)": 21.51, "elapsed_time": "11h 29m 38s", "remaining_time": "7m 13s", "loss_scale": 1.0, "consumed_samples": 1611776, "global_step/max_steps": "6296/6362"} +{"lm loss": 4.87368202, "grad_norm": 0.26122668, "learning_rate": 3.03e-06, "elapsed_time_per_iteration": 6.36047935, "memory(GiB)": 21.51, "elapsed_time": "11h 29m 45s", "remaining_time": "7m 7s", "loss_scale": 1.0, "consumed_samples": 1612032, "global_step/max_steps": "6297/6362"} +{"lm loss": 4.85720491, "grad_norm": 0.25268269, "learning_rate": 3.03e-06, "elapsed_time_per_iteration": 6.4988575, "memory(GiB)": 21.51, "elapsed_time": "11h 29m 51s", "remaining_time": "7m 0s", "loss_scale": 1.0, "consumed_samples": 1612288, "global_step/max_steps": "6298/6362"} +{"lm loss": 4.86842299, "grad_norm": 0.26547921, "learning_rate": 3.03e-06, "elapsed_time_per_iteration": 6.41060114, "memory(GiB)": 21.51, "elapsed_time": "11h 29m 57s", "remaining_time": "6m 54s", "loss_scale": 1.0, "consumed_samples": 1612544, "global_step/max_steps": "6299/6362"} +{"lm loss": 4.86608315, "grad_norm": 0.266054, "learning_rate": 3.03e-06, "elapsed_time_per_iteration": 6.64739871, "memory(GiB)": 21.51, "elapsed_time": "11h 30m 4s", "remaining_time": "6m 47s", "loss_scale": 1.0, "consumed_samples": 1612800, "global_step/max_steps": "6300/6362"} +{"lm loss": 4.85158873, "grad_norm": 0.26980799, "learning_rate": 3.02e-06, "elapsed_time_per_iteration": 6.31954765, "memory(GiB)": 21.51, "elapsed_time": "11h 30m 10s", "remaining_time": "6m 40s", "loss_scale": 1.0, "consumed_samples": 1613056, "global_step/max_steps": "6301/6362"} +{"lm loss": 4.8454771, "grad_norm": 0.25570306, "learning_rate": 3.02e-06, "elapsed_time_per_iteration": 6.44395447, "memory(GiB)": 21.51, "elapsed_time": "11h 30m 17s", "remaining_time": "6m 34s", "loss_scale": 1.0, "consumed_samples": 1613312, "global_step/max_steps": "6302/6362"} +{"lm loss": 4.8635931, "grad_norm": 0.2623333, "learning_rate": 3.02e-06, "elapsed_time_per_iteration": 6.49651885, "memory(GiB)": 21.51, "elapsed_time": "11h 30m 23s", "remaining_time": "6m 27s", "loss_scale": 1.0, "consumed_samples": 1613568, "global_step/max_steps": "6303/6362"} +{"lm loss": 4.87982225, "grad_norm": 0.26208609, "learning_rate": 3.02e-06, "elapsed_time_per_iteration": 6.52872992, "memory(GiB)": 21.51, "elapsed_time": "11h 30m 30s", "remaining_time": "6m 21s", "loss_scale": 1.0, "consumed_samples": 1613824, "global_step/max_steps": "6304/6362"} +{"lm loss": 4.87177324, "grad_norm": 0.26373512, "learning_rate": 3.02e-06, "elapsed_time_per_iteration": 6.32784224, "memory(GiB)": 21.51, "elapsed_time": "11h 30m 36s", "remaining_time": "6m 14s", "loss_scale": 1.0, "consumed_samples": 1614080, "global_step/max_steps": "6305/6362"} +{"lm loss": 4.84288168, "grad_norm": 0.25856823, "learning_rate": 3.02e-06, "elapsed_time_per_iteration": 6.6195364, "memory(GiB)": 21.51, "elapsed_time": "11h 30m 43s", "remaining_time": "6m 8s", "loss_scale": 1.0, "consumed_samples": 1614336, "global_step/max_steps": "6306/6362"} +{"lm loss": 4.85341501, "grad_norm": 0.25798184, "learning_rate": 3.02e-06, "elapsed_time_per_iteration": 6.689363, "memory(GiB)": 21.51, "elapsed_time": "11h 30m 49s", "remaining_time": "6m 1s", "loss_scale": 1.0, "consumed_samples": 1614592, "global_step/max_steps": "6307/6362"} +{"lm loss": 4.86227226, "grad_norm": 0.24677724, "learning_rate": 3.02e-06, "elapsed_time_per_iteration": 6.61408305, "memory(GiB)": 21.51, "elapsed_time": "11h 30m 56s", "remaining_time": "5m 54s", "loss_scale": 1.0, "consumed_samples": 1614848, "global_step/max_steps": "6308/6362"} +{"lm loss": 4.87064505, "grad_norm": 0.25082985, "learning_rate": 3.02e-06, "elapsed_time_per_iteration": 7.19937944, "memory(GiB)": 21.51, "elapsed_time": "11h 31m 3s", "remaining_time": "5m 48s", "loss_scale": 1.0, "consumed_samples": 1615104, "global_step/max_steps": "6309/6362"} +{"lm loss": 4.87134886, "grad_norm": 0.25115663, "learning_rate": 3.02e-06, "elapsed_time_per_iteration": 6.59891295, "memory(GiB)": 21.51, "elapsed_time": "11h 31m 10s", "remaining_time": "5m 41s", "loss_scale": 1.0, "consumed_samples": 1615360, "global_step/max_steps": "6310/6362"} +{"lm loss": 4.85895634, "grad_norm": 0.25029197, "learning_rate": 3.02e-06, "elapsed_time_per_iteration": 6.50139976, "memory(GiB)": 21.51, "elapsed_time": "11h 31m 16s", "remaining_time": "5m 35s", "loss_scale": 1.0, "consumed_samples": 1615616, "global_step/max_steps": "6311/6362"} +{"lm loss": 4.88555241, "grad_norm": 0.24983747, "learning_rate": 3.02e-06, "elapsed_time_per_iteration": 6.60936928, "memory(GiB)": 21.51, "elapsed_time": "11h 31m 23s", "remaining_time": "5m 28s", "loss_scale": 1.0, "consumed_samples": 1615872, "global_step/max_steps": "6312/6362"} +{"lm loss": 4.8520093, "grad_norm": 0.26198912, "learning_rate": 3.02e-06, "elapsed_time_per_iteration": 6.50515652, "memory(GiB)": 21.51, "elapsed_time": "11h 31m 30s", "remaining_time": "5m 22s", "loss_scale": 1.0, "consumed_samples": 1616128, "global_step/max_steps": "6313/6362"} +{"lm loss": 4.89100552, "grad_norm": 0.26929089, "learning_rate": 3.02e-06, "elapsed_time_per_iteration": 6.59220409, "memory(GiB)": 21.51, "elapsed_time": "11h 31m 36s", "remaining_time": "5m 15s", "loss_scale": 1.0, "consumed_samples": 1616384, "global_step/max_steps": "6314/6362"} +{"lm loss": 4.88794804, "grad_norm": 0.2690939, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.39236259, "memory(GiB)": 21.51, "elapsed_time": "11h 31m 43s", "remaining_time": "5m 8s", "loss_scale": 1.0, "consumed_samples": 1616640, "global_step/max_steps": "6315/6362"} +{"lm loss": 4.88259935, "grad_norm": 0.27065286, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.5795393, "memory(GiB)": 21.51, "elapsed_time": "11h 31m 49s", "remaining_time": "5m 2s", "loss_scale": 1.0, "consumed_samples": 1616896, "global_step/max_steps": "6316/6362"} +{"lm loss": 4.86839151, "grad_norm": 0.26209342, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.43360066, "memory(GiB)": 21.51, "elapsed_time": "11h 31m 56s", "remaining_time": "4m 55s", "loss_scale": 1.0, "consumed_samples": 1617152, "global_step/max_steps": "6317/6362"} +{"lm loss": 4.88983297, "grad_norm": 0.26099491, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.3691361, "memory(GiB)": 21.51, "elapsed_time": "11h 32m 2s", "remaining_time": "4m 49s", "loss_scale": 1.0, "consumed_samples": 1617408, "global_step/max_steps": "6318/6362"} +{"lm loss": 4.88559198, "grad_norm": 0.27090862, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.59776807, "memory(GiB)": 21.51, "elapsed_time": "11h 32m 8s", "remaining_time": "4m 42s", "loss_scale": 1.0, "consumed_samples": 1617664, "global_step/max_steps": "6319/6362"} +{"lm loss": 4.87869549, "grad_norm": 0.2538397, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.37922788, "memory(GiB)": 21.51, "elapsed_time": "11h 32m 15s", "remaining_time": "4m 36s", "loss_scale": 1.0, "consumed_samples": 1617920, "global_step/max_steps": "6320/6362"} +{"lm loss": 4.8667984, "grad_norm": 0.26324111, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.70436215, "memory(GiB)": 21.51, "elapsed_time": "11h 32m 22s", "remaining_time": "4m 29s", "loss_scale": 1.0, "consumed_samples": 1618176, "global_step/max_steps": "6321/6362"} +{"lm loss": 4.85179472, "grad_norm": 0.27692547, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.75771594, "memory(GiB)": 21.51, "elapsed_time": "11h 32m 28s", "remaining_time": "4m 22s", "loss_scale": 1.0, "consumed_samples": 1618432, "global_step/max_steps": "6322/6362"} +{"lm loss": 4.86193848, "grad_norm": 0.26579106, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.65991449, "memory(GiB)": 21.51, "elapsed_time": "11h 32m 35s", "remaining_time": "4m 16s", "loss_scale": 1.0, "consumed_samples": 1618688, "global_step/max_steps": "6323/6362"} +{"lm loss": 4.85719681, "grad_norm": 0.2638177, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.70362663, "memory(GiB)": 21.51, "elapsed_time": "11h 32m 42s", "remaining_time": "4m 9s", "loss_scale": 1.0, "consumed_samples": 1618944, "global_step/max_steps": "6324/6362"} +{"lm loss": 4.87708616, "grad_norm": 0.25702006, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.5908308, "memory(GiB)": 21.51, "elapsed_time": "11h 32m 48s", "remaining_time": "4m 3s", "loss_scale": 1.0, "consumed_samples": 1619200, "global_step/max_steps": "6325/6362"} +{"lm loss": 4.8571043, "grad_norm": 0.27729112, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.69132209, "memory(GiB)": 21.51, "elapsed_time": "11h 32m 55s", "remaining_time": "3m 56s", "loss_scale": 1.0, "consumed_samples": 1619456, "global_step/max_steps": "6326/6362"} +{"lm loss": 4.87298775, "grad_norm": 0.27249235, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.75086141, "memory(GiB)": 21.51, "elapsed_time": "11h 33m 2s", "remaining_time": "3m 50s", "loss_scale": 1.0, "consumed_samples": 1619712, "global_step/max_steps": "6327/6362"} +{"lm loss": 4.85414553, "grad_norm": 0.26718643, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.44475985, "memory(GiB)": 21.51, "elapsed_time": "11h 33m 8s", "remaining_time": "3m 43s", "loss_scale": 1.0, "consumed_samples": 1619968, "global_step/max_steps": "6328/6362"} +{"lm loss": 4.89048529, "grad_norm": 0.26877818, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.33669734, "memory(GiB)": 21.51, "elapsed_time": "11h 33m 15s", "remaining_time": "3m 36s", "loss_scale": 1.0, "consumed_samples": 1620224, "global_step/max_steps": "6329/6362"} +{"lm loss": 4.87358379, "grad_norm": 0.25711817, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.5073235, "memory(GiB)": 21.51, "elapsed_time": "11h 33m 21s", "remaining_time": "3m 30s", "loss_scale": 1.0, "consumed_samples": 1620480, "global_step/max_steps": "6330/6362"} +{"lm loss": 4.90545511, "grad_norm": 0.27108508, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.63397861, "memory(GiB)": 21.51, "elapsed_time": "11h 33m 28s", "remaining_time": "3m 23s", "loss_scale": 1.0, "consumed_samples": 1620736, "global_step/max_steps": "6331/6362"} +{"lm loss": 4.8896656, "grad_norm": 0.26119661, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.82543421, "memory(GiB)": 21.51, "elapsed_time": "11h 33m 34s", "remaining_time": "3m 17s", "loss_scale": 1.0, "consumed_samples": 1620992, "global_step/max_steps": "6332/6362"} +{"lm loss": 4.86179781, "grad_norm": 0.2587744, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.49240375, "memory(GiB)": 21.51, "elapsed_time": "11h 33m 41s", "remaining_time": "3m 10s", "loss_scale": 1.0, "consumed_samples": 1621248, "global_step/max_steps": "6333/6362"} +{"lm loss": 4.87614584, "grad_norm": 0.25205469, "learning_rate": 3.01e-06, "elapsed_time_per_iteration": 6.4596045, "memory(GiB)": 21.51, "elapsed_time": "11h 33m 47s", "remaining_time": "3m 4s", "loss_scale": 1.0, "consumed_samples": 1621504, "global_step/max_steps": "6334/6362"} +{"lm loss": 4.85976171, "grad_norm": 0.26155141, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.50199175, "memory(GiB)": 21.51, "elapsed_time": "11h 33m 54s", "remaining_time": "2m 57s", "loss_scale": 1.0, "consumed_samples": 1621760, "global_step/max_steps": "6335/6362"} +{"lm loss": 4.84472895, "grad_norm": 0.25685519, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.41858172, "memory(GiB)": 21.51, "elapsed_time": "11h 34m 0s", "remaining_time": "2m 50s", "loss_scale": 1.0, "consumed_samples": 1622016, "global_step/max_steps": "6336/6362"} +{"lm loss": 4.88902378, "grad_norm": 0.26375875, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.57731724, "memory(GiB)": 21.51, "elapsed_time": "11h 34m 7s", "remaining_time": "2m 44s", "loss_scale": 1.0, "consumed_samples": 1622272, "global_step/max_steps": "6337/6362"} +{"lm loss": 4.90118027, "grad_norm": 0.25478187, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.52611256, "memory(GiB)": 21.51, "elapsed_time": "11h 34m 13s", "remaining_time": "2m 37s", "loss_scale": 1.0, "consumed_samples": 1622528, "global_step/max_steps": "6338/6362"} +{"lm loss": 4.86335754, "grad_norm": 0.24568807, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.61244512, "memory(GiB)": 21.51, "elapsed_time": "11h 34m 20s", "remaining_time": "2m 31s", "loss_scale": 1.0, "consumed_samples": 1622784, "global_step/max_steps": "6339/6362"} +{"lm loss": 4.87164831, "grad_norm": 0.25496221, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.49957013, "memory(GiB)": 21.51, "elapsed_time": "11h 34m 27s", "remaining_time": "2m 24s", "loss_scale": 1.0, "consumed_samples": 1623040, "global_step/max_steps": "6340/6362"} +{"lm loss": 4.87910748, "grad_norm": 0.26700816, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.50813079, "memory(GiB)": 21.51, "elapsed_time": "11h 34m 33s", "remaining_time": "2m 18s", "loss_scale": 1.0, "consumed_samples": 1623296, "global_step/max_steps": "6341/6362"} +{"lm loss": 4.85412788, "grad_norm": 0.25941166, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.49385715, "memory(GiB)": 21.51, "elapsed_time": "11h 34m 40s", "remaining_time": "2m 11s", "loss_scale": 1.0, "consumed_samples": 1623552, "global_step/max_steps": "6342/6362"} +{"lm loss": 4.85729313, "grad_norm": 0.24571414, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.4865427, "memory(GiB)": 21.51, "elapsed_time": "11h 34m 46s", "remaining_time": "2m 4s", "loss_scale": 1.0, "consumed_samples": 1623808, "global_step/max_steps": "6343/6362"} +{"lm loss": 4.88241148, "grad_norm": 0.2416711, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.53596091, "memory(GiB)": 21.51, "elapsed_time": "11h 34m 53s", "remaining_time": "1m 58s", "loss_scale": 1.0, "consumed_samples": 1624064, "global_step/max_steps": "6344/6362"} +{"lm loss": 4.86323166, "grad_norm": 0.25753328, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.40077996, "memory(GiB)": 21.51, "elapsed_time": "11h 34m 59s", "remaining_time": "1m 51s", "loss_scale": 1.0, "consumed_samples": 1624320, "global_step/max_steps": "6345/6362"} +{"lm loss": 4.85882092, "grad_norm": 0.26733419, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.61991429, "memory(GiB)": 21.51, "elapsed_time": "11h 35m 6s", "remaining_time": "1m 45s", "loss_scale": 1.0, "consumed_samples": 1624576, "global_step/max_steps": "6346/6362"} +{"lm loss": 4.85878611, "grad_norm": 0.25600559, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.47676706, "memory(GiB)": 21.51, "elapsed_time": "11h 35m 12s", "remaining_time": "1m 38s", "loss_scale": 1.0, "consumed_samples": 1624832, "global_step/max_steps": "6347/6362"} +{"lm loss": 4.84133244, "grad_norm": 0.25695032, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.49308825, "memory(GiB)": 21.51, "elapsed_time": "11h 35m 19s", "remaining_time": "1m 32s", "loss_scale": 1.0, "consumed_samples": 1625088, "global_step/max_steps": "6348/6362"} +{"lm loss": 4.86371851, "grad_norm": 0.24358527, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.52201414, "memory(GiB)": 21.51, "elapsed_time": "11h 35m 25s", "remaining_time": "1m 25s", "loss_scale": 1.0, "consumed_samples": 1625344, "global_step/max_steps": "6349/6362"} +{"lm loss": 4.87213278, "grad_norm": 0.25444475, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.54930496, "memory(GiB)": 21.51, "elapsed_time": "11h 35m 32s", "remaining_time": "1m 18s", "loss_scale": 1.0, "consumed_samples": 1625600, "global_step/max_steps": "6350/6362"} +{"lm loss": 4.85728884, "grad_norm": 0.26343903, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.45327091, "memory(GiB)": 21.51, "elapsed_time": "11h 35m 38s", "remaining_time": "1m 12s", "loss_scale": 1.0, "consumed_samples": 1625856, "global_step/max_steps": "6351/6362"} +{"lm loss": 4.85075808, "grad_norm": 0.26018319, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.62725759, "memory(GiB)": 21.51, "elapsed_time": "11h 35m 45s", "remaining_time": "1m 5s", "loss_scale": 1.0, "consumed_samples": 1626112, "global_step/max_steps": "6352/6362"} +{"lm loss": 4.87475348, "grad_norm": 0.25670764, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.81020284, "memory(GiB)": 21.51, "elapsed_time": "11h 35m 52s", "remaining_time": "59s", "loss_scale": 1.0, "consumed_samples": 1626368, "global_step/max_steps": "6353/6362"} +{"lm loss": 4.85544062, "grad_norm": 0.24622947, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.35615683, "memory(GiB)": 21.51, "elapsed_time": "11h 35m 58s", "remaining_time": "52s", "loss_scale": 1.0, "consumed_samples": 1626624, "global_step/max_steps": "6354/6362"} +{"lm loss": 4.87168074, "grad_norm": 0.25220147, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.36283255, "memory(GiB)": 21.51, "elapsed_time": "11h 36m 4s", "remaining_time": "46s", "loss_scale": 1.0, "consumed_samples": 1626880, "global_step/max_steps": "6355/6362"} +{"lm loss": 4.87017584, "grad_norm": 0.25670332, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.46382546, "memory(GiB)": 21.51, "elapsed_time": "11h 36m 11s", "remaining_time": "39s", "loss_scale": 1.0, "consumed_samples": 1627136, "global_step/max_steps": "6356/6362"} +{"lm loss": 4.86277151, "grad_norm": 0.25644091, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.61350536, "memory(GiB)": 21.51, "elapsed_time": "11h 36m 17s", "remaining_time": "32s", "loss_scale": 1.0, "consumed_samples": 1627392, "global_step/max_steps": "6357/6362"} +{"lm loss": 4.86824894, "grad_norm": 0.2459643, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.62822962, "memory(GiB)": 21.51, "elapsed_time": "11h 36m 24s", "remaining_time": "26s", "loss_scale": 1.0, "consumed_samples": 1627648, "global_step/max_steps": "6358/6362"} +{"lm loss": 4.84354162, "grad_norm": 0.2465369, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.48481679, "memory(GiB)": 21.51, "elapsed_time": "11h 36m 30s", "remaining_time": "19s", "loss_scale": 1.0, "consumed_samples": 1627904, "global_step/max_steps": "6359/6362"} +{"lm loss": 4.86155462, "grad_norm": 0.27435771, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.60013843, "memory(GiB)": 21.51, "elapsed_time": "11h 36m 37s", "remaining_time": "13s", "loss_scale": 1.0, "consumed_samples": 1628160, "global_step/max_steps": "6360/6362"} +{"lm loss": 4.87701464, "grad_norm": 0.26929015, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.43947744, "memory(GiB)": 21.51, "elapsed_time": "11h 36m 43s", "remaining_time": "6s", "loss_scale": 1.0, "consumed_samples": 1628416, "global_step/max_steps": "6361/6362"} +{"lm loss": 4.85249853, "grad_norm": 0.25826731, "learning_rate": 3e-06, "elapsed_time_per_iteration": 6.73206997, "memory(GiB)": 21.51, "elapsed_time": "11h 36m 50s", "remaining_time": "0s", "loss_scale": 1.0, "consumed_samples": 1628672, "global_step/max_steps": "6362/6362"}