diff --git a/global_step5494/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..43fb248f7f80ed4fe1467a169758f365c164d6ba --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:566aee193e2f8127faef1d276540c6904f01057ea1bcb40275d2de3aa562efac +size 208731415 diff --git a/global_step5494/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..435f6a2d98606032a8c2896c3681c3b5060ea014 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:753e4194b47d1d0232c73978700e27e734c9b665aa351fdcabe6523d5c1396cc +size 208731415 diff --git a/global_step5494/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a66a5984ea3d94e700d4cf86b65b90b3d324465 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682a5e5280bbbf92e7650d408a9520fb3f9382fdbf8bc0ac0d79cf6f9c671c50 +size 208732183 diff --git a/global_step5494/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0910bc20925aa37fd9189455e600ef10cb9add87 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f68d9de00f54224f5b770618b12989f8d6eee15604d101c251011831ab3d71fc +size 208732183 diff --git a/global_step5494/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82a5b8b31e588c5a1d1563cf0edff91ddf0a816b --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:854df9755355942cdaa589cee126131ad576872a79a36a7a06188ac8486b0929 +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14e05dfc93f5a26b8e34742500623154ce7eb5a2 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:035b86a4949eacf3a42ff0116ba9ccdde2fcf18d88ffb9a941c75141650954f6 +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..583899a31ae0bc67a229ef83e8cd737c5d6dbccb --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ef0aa454a83ea7669d2e508f50906c87973132bfa493a6ac0d749de3e9ac307 +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da2111ba1608cb2adc24c0bbcb0e2e423c28fed9 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d182b8c073bb01d89801508101ffef78b1b29db1c39592fd04dd43d24ef862e +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a9a68a1a4cedd1bb47e8f8d1398b6abcd356e2a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11292533bbf537c4592ab55368bc2bf94758a40206ba6641a05c9e64bade7fd6 +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..22939a4431da2c72ed119a00ec60948077eb3870 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bac60c238ae2f37897455609bc4b9af577785e8885284e27344f2ecfb9cfbc1 +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1baacad7fc90f53237c628eaf40e4fbbfb06c2e8 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5e08c29830c004e838a1efe353ca26a05cfb2ba6c369fa70846355bec48cabe +size 208732077 diff --git a/global_step5494/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79611c036b55ebd27e6b7bda20f2d28426287803 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e10397c4ceba96b1a8b11a655bdac3c42a86afbd606d38310dcb691a60e14dc1 +size 208732077 diff --git a/global_step5494/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c70b41a2302d94653fda97d87c144a4c99055242 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:303c3fce32626ef02b43fc9294298cfcd1f6914be92dfe7e9246a6740ead311e +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4100be2a8f1d078b5bd9d8ad9e3857956047d5cb --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76f4c79cd6467489ea4a8c71d894de8c54f75afa986155ab4b80c1baeb284b1b +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec730a974540ae0d4dfc3b494895d085d75197ad --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c089d8b328c497e0ae0dfd7e348955329e45572e4aff06ca6a83889ac141d9d +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..582386c031cdaef795d612b29fd860f2bac74071 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34946349cc299dafded05ab1d1fc1ca2652fbfee1870d629e7617d256d7797c0 +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f0bd3e0742faae704477b6406d5044e51beda21 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa9341f34fe3eba36548c6d9b586b61de41ad2f3a9cd727a76d9890aec852261 +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d235be343e498bf748222cf64cce4ec03685cb7d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1ece68bd846fbc2387f421d8109297bb81d410e47d674d36645cc1ad6a751bd +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85b241d582e4d61395dee09149894c50256a873e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fe7bcf97dc58d252539b58ad290a95a3d36b946fbcd0fcbb39fcab4ac5e8542 +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..321e9dbe0df9cc485cb943c1c94fd74c9069be4d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:953f0866fb0254245f3844af13ffffdd7d142b582c729e1a84d2bdf709cc98db +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..20834cac7003867845baa4552cf6c897b3c55280 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:601eafe20ebdbd4868ac949f2672c23d85f7234501e951a6df9e15a6047b8083 +size 208731501 diff --git a/global_step5494/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7320c686889ee58bd32cefae5ef4747798a4e12b --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f5b12ec9257bcb702a7097f9f8009ae9154c2766e4c8070e2eca86685b3073f +size 208731501 diff --git a/global_step5494/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8f2f707f79966e0a7d5170679d90429904f7ab6 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36ae4188dcbd997368831ddac52cd780508ed8992bbf5ed542af6b4bf5428995 +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be3ff793107731c494dcb5ab336cdb8d8319decd --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05ff00c3c06d85c97a49741b5676b00cd61eb28c1f8c18e09c29615a4d4c7150 +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..494bd51c1ae58cd101c4d21363e909f43cda2c71 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:771706f6ffdc1b3f57f82d2b590077f2959a55c1184621c73f5f2559204322fd +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..846548cac7620c28993ab36ab2243c5726e611e4 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d0afbabe5c3d8969056c49d095af6cbb0f92a8c4c6d89623a5bda6c6c4f436f +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..edd090a78ad431c340e7a4142789540bdaa604f1 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9327beb3f146736350a170df683d511b967ff0a2e922e0cb5d2de708e32a8a3e +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b10065c0e997727c5f935481446e6d584d17aa4c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30a68584d27965d8b506829aa4d77db3facab53cb15f62fc26d877e6a8f4c4eb +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dda2408e167913faab5e18ed92a8120eb3299320 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0c278813b30a206051bc3f1450588892c9c2b974cdb447f7539201714ccfd35 +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1cea8b6ce963a31d8e7d2c4018efa65b74a39239 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8824ee8a492fd86d73f60057d15bd7f4d176000448b665e3c44d74cc4328a1f2 +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..063c433c9e6c3611f51c8ea7d4142be0bbbb08d4 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:503de2f843fae705d8c5ffdd7346a428f0b28cd1811e0f7dc34766e5cee36ad8 +size 208732269 diff --git a/global_step5494/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38cd0b4e520adacc0ccabe7f81c09a20e803b2fb --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0204193a3751f31b699ffdbcf591c3c41cc4d56474003d38c6f20638aed01cb4 +size 208732269 diff --git a/global_step5494/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..608f66346c857bdac09919dbf9163d14ceb6a5fe --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e44148c735566b299b48ff3f421bcb518d5d8049dcdfd186156b043bd8382c1 +size 208731501 diff --git a/global_step5494/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca0f7e06a234b4bf39037e19531ac9e5a26c685a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c8d4bdc60f223da490acf93663047feadc6361a5dad86568fc4559ec57b0574 +size 208731501 diff --git a/global_step5494/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..009a9351fafb868e1da5cc022fc4e2839bef7969 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b05e8e69f18e0dd4c13fcbc3d0c54f58e42ea84725cd768ec4d65c15947edc21 +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f3ab9e73da5b542c476b1976a9c8520e0f40b53 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d41835d98f0455ae2c03a16d84d3807614b6c89841ec5a51bcc41dbe76432256 +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..078a249f0b7850afb82d8fdf0e5f4df9830f653f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d0f6d53fa99e17532f3472fef46f7f4438b48ce04040d537f4745825166b05a +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5746bcdea5a594651ed51f405df8225cb0c93aed --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbdafc10e39d46138619cfe10673c86eb27c618f3386bb57232e15ab93c1668e +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7bf77f261eeddb6ec1b4ade7feb78051d022599 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9f175e1ca65c0ab9c995c3d53c2b0bb5a0da74380aed7267cf308c754bd1fe0 +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ead27082a9e9353016ef2ee5b805fece3d3679e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9dedabc562f57450e6ee1faac8011b3bdb11064ebe6a5fced72e8f2de1ff564 +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c5cb52ae51667c843d91f795cabf4024d103440 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d767b226227632e9fd4807693ec44c4871cedbc394c953b1a892c2d68ccdf972 +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd774f66566052a547e525ec985fbef38dfeea32 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d4933d007a26f0bdb19652eb2f76eaea093f5b543b34376bb454aa2785b71b8 +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cba2c29dd30929ddadb497a3910e8f15e802ca4 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ee62950ab26d779af609d5294e15a86ad03e63c67bbc72abd51a836df94ee3e +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5af0f6aa0ea6f4b3cdeaae601ff1f78990da4957 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d7141874e0fa2ccb40ba3f61d1521ab6f3d3c58d1d0aa11d274f5e3faaee542 +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99fc3af53388bbfc27cf53fd51b449948189d100 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd075fd8918ccc0928faf91ff07bbd00f0a96263bb5c5b209740100c975bbbde +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bd05e7033c6d31aae0f6d3bf2a2186d06ffdf6e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57c8b6a542d70af39ebcb66ed2ae46de4dd575c90116e8228f9c1e904d289ccd +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e67ba2187ce817ff3a08f60629122aea94dc2d3 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb640335f8e43f14f42da50934356cb2e6eea7e7611b8b780f00052fc1f97d4 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..641daa4850af64f1aaa0ca3ec2e5f9a8e6b03658 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bc9a48d5c4b98964c2afd0d2ac7c359b15943dc2ccff4f519fdd8e1925dbc95 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbf6ddc3691fdc057683583adb67952cd35d2bdb --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abc91951960e8fe5313f79c2cb1c6044dd131cd93ebae98e6c9840761e1c3e07 +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f998a5a871bd89aa18f4dc6567aeeb76a11f7ef --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f07d899fb19df744a77d40e66c4134dc6aeb8d74342406658a167a0a43775601 +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9e98aa4b7f20578542f7747628349e8b8c967a0 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50fe67f616eff992e6b7592ffd09e6c5dadfda244afe559a247911d9dedb4fac +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ada68266652c776f43e9bd81499e405c036cc3fe --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:504fd3a1108cd9cf65403e873233d674a2187eba2bf941e34af9d7d7dd80d6ce +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bd0d0329a3cbbf2cb36f7531e9a57e2bb152341 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cd37301a2b96f32609c8b5cd546cf2b19907ecb5b89794349637fda6024d510 +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..836a1009246829d16690afb94a2cfdea12e1b28a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11cc55575f705c20e1e1c84c893c99e42f4e467f6f449edb132fac6a15bc41cf +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35b59c7e07d1da657d2fa7ae2d852cd049ad34ab --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68630448f3db2f446c6bd2f5082f994542bc02be2b89217a783a027406dddde2 +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8258fdf5b08cf0979feca0a2940fce980f9cd13 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91897fbf9a38d30462afef090970c1792e05fff133901ce5a365af83a1666ad1 +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5b0fc0f5c5db7a9f013a62aeae179e83913106d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d7b3599f3dd011ade19aefba780e9dac459acf871c3932f03d7c4b762043590 +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e682ef880272effcc56c9b6863c0fdeba50fcc74 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10f9a1b1a2c348c07a1610881a284d6c0b7abf43aad4e5dc2a6d471cb857d01a +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c6b19bc8463daf6a702b9c93e613fc240c95574 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4370355c028c97d87a9c89b1b9cc39e2981240efd3442d1d6eacdbf01d7c9e5 +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af1152c83c6a2cb7fe087436408adbeb116abe3b --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1be80d4059dbbeae05f30b1cd266256e314019467d9fe3245a794b2dfb2e21d9 +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..733e9c8da94fbcfacc66322dbdfa6a9d38261af9 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ef6ac61675b019e23c29d44abea3f9f8db7b16183638b0a7f7fe52c04caa822 +size 208731501 diff --git a/global_step5494/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0e6424a8b2e3a06aa6d589b53b961536585fbcb --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9523e317c17a8ff1b5c0391bc0e2185b0ab77155aec8ecad6547a2541eedbac +size 208731501 diff --git a/global_step5494/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..37c76cc944199087141d050e0f13f184970311c2 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df7f4538c0668b641e4f697e9a92df1481644b704bca925bd71dcf25bda9bbaa +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea850c742b742cd71936edc020ee4f30cba5fad1 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4beb5a59feeff83970db65bf60f3c83a24c03ee1770992a75adcf17cc514e359 +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15af4c5154338b99b3d895d7793fcc7d3cb453d0 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50b660b5ee4cd515212d96b99fb8f6cbb2678e81c595d869b06e9c146c19b5ed +size 208731309 diff --git a/global_step5494/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f5e170fb67d867f10b34d6f4cfb8c6fb3d83068 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5634e616087322a4c1068c9829e5e0065adcc16a32e719332cbbbb7d7929ceca +size 208731309 diff --git a/global_step5494/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75b18edb9319899a07ffccd1931ebadd6803181d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b862f38b314e08789e9bac46ab99f1a3a8dc7be09c60aa5a49f833aeb2bef086 +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..47707b84c965797e6e48fcd4f50317054c807936 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe5fa4440f0d5a830dfe747ad28e4fc187b8feba0e55dd685f7255900d617a1a +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef026174822e0eb7822b4b20a715d4f6d6cf5cab --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae75cce528d568984b45f2a64835e4e70b15d34f05a22d5b1b1fe16c55b74c77 +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..70064c0fba9c13a73ad7b7b445164af1c212d13e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83906d84010d1d0b1df2cfd4ea9c40a8d377f6f6378b5cb470a99dc53b6cdffd +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..45811c4f8109858356cef7560a67160ce1953812 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac488beb895958059c31b5bb0a6014df6bb8d54c86f4b78c3bf0d951b85fb3e5 +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b10fe31c294e6620cb854bad7a86657a413a1e18 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5351827cf7fc17fffff0a19047757100d6d57afeae2e274fc25f3ccd0ce030c1 +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddbb6a0fcbb17998bb4404a07202e87b5a8aaea4 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:826873382c52a9aa8a9006c45b5d4368fdc3098ae9fec34f030de5ab6551c0ba +size 208731501 diff --git a/global_step5494/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b791575fd7db6784b4f34fd6035165870560ac8 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc09cb31e2ba4dc1c82371bf906b42750b2471c6bb28a411041f27de349251be +size 208731501 diff --git a/global_step5494/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bb5869bee7a122491f8ca449f5507d81d9864f7 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aca22ebee1a0fda380c6699ca898b32734c9195478e7b281a6d940c492f36c1 +size 208732269 diff --git a/global_step5494/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83f12e15b5973f15d0586706bd2946fb4eff3bf5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6228f3996ee57e3f743ad64ddb56f647a2f5df8aeba430cb064ca2f18946db04 +size 208732269 diff --git a/global_step5494/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31544a53866b705f7a79bfb7c0008894b5848db4 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77bba9670021c4525c4ef6780865be7b998bb1b226faeb97031888d96a84c1af +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cb53ed692daa80c342d4eed717c08e1637965f9 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53f5114e44ba63631c6941f5e106de195c06523cf040c24a69e3d7a6a410d576 +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0b40848a297c5ad5d7143606e891adfa107c632 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a80853f846f9110b1fa0f36d3aa9370e020d293a20ccf483fa6486a166a6472c +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44790ee9e5400a1a08f797515ac7f941e234d552 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98a7ea2a68777dd9c260949dc412c7991c3abda19338d41ebda69ced7809a88e +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc05e5cdd9f863d3d5464a59ae6fa4caecd1a566 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da7ab0769924cc7d3f98f6da247e4459c60b3090902c46484ef751ca053ae886 +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..90487570b18a5d0acdeba6dbf3bc0da6490c812f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8324d9629430a484d02710ce74910a30ef250b3ebc4e8a9f19599801d88561ad +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..984aaaf28179acaa528c8acbdf35d4e1302fec1d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0beb6bb17c8d52d809a8469a83db1552764cbe4bc8765244a0bbd1116e718b6 +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..efb407dfc32fbed13c8d07275483ce63f0ad031e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fc6e4cf29a6db164969b83be34a0afca13d00cec87571a9a35a1271650b17b8 +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e749e485816953cf50b316292822f631de9e2985 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85cc223a19f4bf1f27db068f3a4790ff73e6bf905a6cb0a71d13b0d79273085a +size 208731501 diff --git a/global_step5494/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..06c7018276a07b856fe0bb7c50b883fd7dfe4161 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a585f5161be40d43edc28c9e008042ee4bcb87b1ac73baa7f69bd2917a801889 +size 208731501 diff --git a/global_step5494/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1b6ec92b5b3a9a07b649c80231b5ca3c7e9a0fc --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb33cbaf5b90f88b82ce6467074328dd06e04e252308897be734b56039438e41 +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f861d7b5db7f3495b8c56c15b5b2fddc908eebe9 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:475b18219c1d5eebdff17e2cc88a59f0f8c83fa3cb58d36e6e0974ec964e6d11 +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc52e69d889a0562e6e7e48703399974ad9ffbfa --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7e9cf544d08253d136be3ed47944dc7f9a1b37dacf8c64cfe1d57d957f46732 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78aa9e58997653d7ac90de7085644193807be069 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09d046ba2578f7b414fd682dccd616af8354d0d4f6767f3e14a4cbad28e93ae7 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..570b5bf1eeb8c549e6fbe4957aedaa7a14e669ea --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d27f216337d14c02bf0a2a722c60f097615f413d633a774876c6673245258c76 +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ebe1f108f4c55c308c59d96095bebfd77a8d8b5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b05c7af63fc10629c89302fe5ad83444bd91269dad3166b1a925366c1e69f193 +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c1b0ec8ba05d9ce3e022a38d0dba163199820bf --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0ca6d54622fba3ac9ecedc154567a1fc6738664244bea5d201e00a2f15d54d0 +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c370d9a108809e650fe3588d70250eb708cdafa7 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea6ac51f53e267da40235e8418a3ba05c23227ad33c85cea3d72576b2f1eb2dd +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d279805b72b154ad019d3e23ea3dbcd5a3104cc5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ed43d8d26840635ad4d8ee4b12d0266915d0d10f82f1f0f4541004d940c5815 +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..104a05fee523341b2ef7e0140a4c25e890781230 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6e5ac20b25200a40dca080d2ca3bb2f10dfd1e4a5703cc3726bb29531479d5e +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3667571de9f5ca9f7129d7fbae56a6b256bcc295 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6872f8ca3dfa8c9b24a43b81c78a0e1468819616b964dd5eb7d35f1518ceadf +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7aa43f3aaf3bfbd857cb5752a696d26f1fd71921 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c4ffeea4c97696a6f8e1664ae850aa0211eef727ff1c46364b85f983ab64ff1 +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5360e0abc5ce97bf9be55b897d4a0d1cf98f9446 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d63761ad56f9b1e38f8a47a6451befbc3fadd67ca0a8ef54ab681031b2035142 +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f82724ff187a8579cc64024ba65585cf9d007ef4 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b8bb3b7c749220a4377f22b40eae7a47a33be3ba121bf7996be3a15299166ce +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a55ec62827b4f0b24713ba85c8a3ce99d33d7c1f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:750f3b4320c4aaaed2a6bdad904e62408a0feb77e1985b593c0f9d69a9bc4b1d +size 208731309 diff --git a/global_step5494/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f505abe6cb0da8820521f744e4eeddec8d120eea --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08fae78eb42d8da465c8f7ccd978b8ce05b05bc2c75fd12d81d9bb1aa95295e3 +size 208731309 diff --git a/global_step5494/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0122fc113db738776e106dd4ffa416ea70b69922 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ae8e662aec453a87d9de65cd701d25654ef68edd984aa334ff2168543cf7925 +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e80976d75eaec07dcb4967c2d5982d451110bf82 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d22951a6b7d35f01208285fb7f99091cfa74555f8e76ec5526113f3966c6fc6 +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc38feecae6be08e70968e30c40ec02a32c75e9c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53047fe3923f7dbfd820faebe2b7af9be48c698ede3750243889cd185d9d00f3 +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7eabcf4a6f6aece04dcc9b6f14b65b2c1d6eb813 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e68a31612926b4c10a29abe343989a16f27db0f300d0268b00aa29d6a33c780 +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b16f7992274b49489fdaeba0996cf104101970e4 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6cc562c39f984089cffc62d1f80c406f0e0f91dc39ad1cae1d19a6dd4a8cea1 +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76c5746310b1c5909572392af89ee1969841adf8 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:351f734b9d55f51bc62b5bdd46b4985e4340443f0fe17a2ee7feff02e5f5f757 +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f461241020c9f9e4242f9e2554464225c41d4a90 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3970be5f6c6ede15f282be7d03614000342ac79b0c0a11520f3373b19dce7a25 +size 208731501 diff --git a/global_step5494/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c76c2c70af877f49d5c42fbf01b3bea9e6585c05 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6ed8106e2177e5b7cd527ecff166e45d296aa936355f841eec7c3633083bc1d +size 208731501 diff --git a/global_step5494/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..210a40b9f9b5db35007b66decb3d365266ab9201 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c6907c785efeeb5e1cf3ae8b5bbe3eb35e6081c567ddf5cbafe10ff0714434d +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..94880dc5e58b5739ace2bb65ec36c47d8abec4f5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9314560abd6f4aa3c83642185bcbddc2649c7a09c08031c637431f6746dfadbb +size 208732205 diff --git a/global_step5494/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4bcfd2d89b680de1563e14a08437275e908eb2a7 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f412ca3a9608378bcf9d7cbf188f985b150d52ef0488f30f2a976de4acfbe83d +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf75b5074f05b5c02c205328406eb3f87db6a7ae --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90ad44d343ab244fefc76ee0c919da30b611c8730f597f595e17c645f78cb6f9 +size 208731437 diff --git a/global_step5494/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e9bce90c6497f30c6d43666ef00a25c7a0c7b00 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5620b66a5eb00e273d85c2cac6ffb18ed55d8807d7d5d26c647f60b462b5e495 +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16dd69e0d98c1fedaebf1c8d545db8ff73db7066 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3206396a5f767fef11a9afdc90c63590c680a43e6905145ccd81b32803e6a32 +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..becbdf18c0bb996f06fef758fce2883d2389cf41 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59efb022ffbfa03b97b608133b6ef05281ffe7071c472bbfce564c310e8f2cd2 +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6d49f07377697b47a8978095807c13fbba8b192 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bf904088637350d912b1671ee1d6f714b4429cf7d0f79ae1b48bf27185b6d56 +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8162209b452481e8370307feead22859d33fdf3 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:806180b86223883dbf1f8b436ce4ece74b1f8524eb9ea39523d103f9f71e4f29 +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c4339253962e4753857b59d25069bebbef3cc64 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48918c1cc6dce04df1ab7ae21ef8fe4b474c37be598bab3f5bf9a7886879a43a +size 208732141 diff --git a/global_step5494/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..30b8c20741952b6c3abae2fd76a283db76b21119 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79c4f13a658510774bd62672f729e6c9bf1a68f5ae7196d0d6cd65210954bc96 +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f38aa25213cdca3a9559689cb3de22a1cea885b --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d50a5db6baae1472327fe766fac847791e8dbf032ef1d1b91ff6f7c8a3bcb1c +size 208731373 diff --git a/global_step5494/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..622b1b18477cf329c3004e7706074a197e5cf169 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1c6af16757d01765bf4733f747d85142d2a34631a3e1caae9d10f5fc5789775 +size 208732077 diff --git a/global_step5494/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..04e8aa41e37ddd06e9501ff8bdcb4b1a1d965c0e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b65f44788c608127eaacfd3e41c7b937517ad6b95f1800316aebbfb87bfadff9 +size 208732077 diff --git a/global_step5494/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a6ce60708060d80d629ec5aa7e2cffd48cedf78 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e50b60b4a322d2e6beb75e28553bbdbe79dff4787d881606c14b5cc548486098 +size 208731554 diff --git a/global_step5494/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bc8733b68bf3bb25918d7006b2029b0670ca157 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b6a0b0123044e6d5d4b126f2412f4440fdfcc1aee96147737390942db2760c0 +size 208731554 diff --git a/global_step5494/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f80de7d12843dd40c75a3774ad59653b649ffce8 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1bcc5992dd99374c0e0e65531a84697ebdf1961db66af5309a3465262eb6f72 +size 208732322 diff --git a/global_step5494/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac96d01aedfcdfa740d5d0c80164b45c7b5eb6ec --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:160b7a662456b4a149151421453b6152809eb5644d42f63060b079fa2939c8ff +size 208732322 diff --git a/global_step5494/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..47aa7ceccab925c035ad1e27c175608a3f5d52e6 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b29e52d19dc22ef0e361a320cecc9f908a9520ecde5e76d4eb97919346e9836d +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..08f5abe29fcc27d33c6fbdb6b9404c8e3b434e63 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f46c8bb6b87fa0185cfb3545a32b639ed273bff1643acab588a034c566bd62f +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2771279c3ed7416d172ebf567378da3a930980e0 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f060765729c79c834e8695b0392579d9f25b849857ab18c3ecd27b2353cd6bbb +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f92749f0c7f4192409595a0891a607b20c2d849 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad658bf95b7c2f1983ef9bd4df92f40ae12026e1bd6847d5615a07d680e67aa7 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cdbb74f77a6ffaf9b03fc27dfa051def194094ce --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4077cca67b75df2834361c1a687e0a3b3b43ce2c5fca115603f5e11dd338f58d +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d95917e56e4320778588194222aa1d73027a5d9c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b7144f52b4e179cd0fa7a1b1b001af802d7b0596f1c432ac25d10056f5b578d +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2046c10036e88c0a74f3664ca0b0bf726163fb7a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29e171d9be064fec7d2131b9dc0a5f1c437b46b376ec2b36866b3cd08483a173 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..586eb55cccb061118c46cc3f8fb6595590a9ddd8 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4d38117844ad5df8839d25a504398daf03cfddcd29b7c1c57b6345fae386bff +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f35bf3e6610ce850ce650487c97fb4923ecf8c08 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2494c79d0303c39ebbc6b4fe589fafa130287b508e52fb7495f6771edb78250f +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..02a7d4c32a3bb44e2511e0aba9a488d34177c900 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:115733dab96a16daa96f07bb3cdf6889c374d1b2da3e824cca53615c47c59b49 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffd588eb88942a5ed1ddf8a3654cd9b44943eeee --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd6afe945239e2a817819c1f3e86e646e0b786a326339f871ba5173d33ae9449 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b716dde7ebae08e2b2931e78f4cfd30c5648a0ab --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8935adf30f09f0136845d97c6aa71bd9be7d7d6240e922efe48e2581689841b3 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0907fd3ecf359097d5029cf783ea83557b880a1c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c580297a58640c205fd0cbb8130d209394f3884034eb5f5976b304da28010ac7 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfc6a1368a6dcd65648477dd824f82ecdce652ed --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1e226e5161e0698e28a8f03037b0a65003c70312e4f8fa0fcc4ae6cd5398ad9 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2043348c871cd15b8fe331c0c548ee537e8e2a5b --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e17623438e0b87b67544de70692482305b8d510744e1e8077eb6bee8d78cbff +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8549d537a28da791f4234aac8900f35499790ff0 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba0677cf517e69dffbb15b563bb7d68cd1c40f3cc4fbca7753f45308177cecb7 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a72ef98f154b9774998c95d9230c77b22cd9263 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae296a43345ffd71084d1b568cc1c2691733fc50b94876c9cd94ce335b45bdf1 +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b255fa3cbfa2f2a4f29234c1d8bf397c8b7d66e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27aba24d07879f198d8f09eb2eb00a366c8da98b173795fbb61a3b47060f34af +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..410ac105d33131108eddaae727c8e06d08e844f6 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d12a3c0c47d18632e15ad990f1054c4f8911ee8ff517637dabe97961ce4b2750 +size 208732066 diff --git a/global_step5494/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c543afd1760ee0bb235bab93b9b1c3eb328e3c2e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b612538d9089ece13c3e2c28834e76713f393ab0f9a5ccad6988a7d7a4e533a9 +size 208732066 diff --git a/global_step5494/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4b75ebc7c3618e3420f87278865bdc0bdabfd7c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65c269bf67f35f817317cda7418328a468b4a0465f9e0853bdd6ad69ff027e14 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5a12a411bb8c6f12761a6ae2c5b4ef96c187779 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4320959b7e98f7224db1480f0c204dbdaf56d66cd9620930c6aab48aa26e5e2b +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fb37057b3ba3234e2d3ecae22de6075c26857e2 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8434e3f84d2d3d18ddb74ff810103b62a4c6272956c6524ecf81a16fbf3ed710 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe4dc898d59b31afe1361ad3c2947942504eacfe --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94544a60880c4d701cdd42566edf5b481aa6364381fc682c899373d9c642d208 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e945722ad21cd68ea982c317a33413ae3e10be6 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7640b9dc043d60833a05628dea51549b5b0279b2a264fff2fbeac0a2869b00d3 +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10d37f4f1eb0bb8001ca8b5bf13cd7f7cade3392 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c798f2c9d1ff8d0f352eb219f17a59a254e30c72a71c8805cf1f231195229e0 +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b1d56896529931de2755a889a1945016db8390f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f06058223c3e7fbdff24e34f6014d82ca56657b538aec2fb77103893bf9447a +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..96a2b13015c75783d48af9f0d5727bde3166b83d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ef932f8c7d67b720ab81ba350fef5e0e2163ff9ee207ae8609d888bb7b2a191 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a210a7bf6cea0914c2c751266b21d9a3a971b28 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92327f656b361bbf01425e9086b26ed9cf45eaca9d2cc418ffe4cc05cf186421 +size 208731351 diff --git a/global_step5494/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0341eb61bf5a1a684b023bf7173ba2ea25762f5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7d065c9aae0438a6762ff6ae802ce1f9c7a4cf29ea39bf92fae9fbe1c71a42a +size 208731351 diff --git a/global_step5494/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a599a7c0b096c501e6aae5791d1a915ab1ace45c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2969724a5f3ce5e749f6c184ccbf3e8187e1a7c476a73326e7dc8e13d2bd7671 +size 208732119 diff --git a/global_step5494/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38d98942e6e688795e9f4e46b8fb970184136251 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0784faa1a25bc2a7fda57bcddc72463ff846474fc60806a01cca385c1df0fe9a +size 208732119 diff --git a/global_step5494/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0fbe93c85488943e22077cc3df73a32918a110d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0ef0094c7bff5ed33b1040a262dddae3a8868904c2f231b4cfad619e54b72b8 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d4b6067eaa800c5db3493967df31c993bbc9f46 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fc9cde416accd9a370396761a6d841e601c28999ef9c25be5bc584abb1613c1 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf778dd2ed9d0ae5d99659736c4812fbec83ac04 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c4059ee8d5e593ae642f24e8560d2fa7c57b8c208c0c1e584dc8c9bb6a741ea +size 208732322 diff --git a/global_step5494/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9c3d8e5b063757e8dd597fc763bd23df919a87c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54948549cbec5b8ebb3d0e9c5835e09812740d887017fe70213ce7748cf509ce +size 208732322 diff --git a/global_step5494/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bfcc46d4350067e4386cffd8fc05043561d305e3 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e27e02d596e9ce14b15ff6ccf0471feeac1a243b0245af765aaceea4f9bb2c1 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7770c13cfae289ecb111b205da114ced80da0ec --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3975cc6f9e2056140993356b025b2e458573e5619fd1173c8fb06acde0706e02 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9beda052bb463fd14c91ebad2f53d580e530a612 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eda61d1b56e2f37af5e30bd640e4c0b78c2e62f98e97a966f15f418c02990d3 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35d79b5a52baeddfd1db6a075ae53ea9897f6d53 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48f9c41ab10fb63a2d4c68245ae401df4a81b4344274274fcad856ecb6cf1a38 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..afc39520af978fd7dc38da4cecf2eed7be69d4a3 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84ed3292d6b9886c95ebb7b88902886b578ecf0befcf762de466d874b797d94e +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..66107f86c5387982076eec00467771ca7785ba9a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6fcabadb3a8d974731d76578f1c8bb2238555c235a1a40eebadc85e17074345 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f938412c033f7d2465439d9bc5841cb67e3c347 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ea7cbf2f8fec927e0f22aa25331e5b371d1b071124c18691cb29b5204378737 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46bbe30d539e78b94bd06e5ba1fc58d4705ff11c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5101350a1975effa22d54cf152f706afdaf593d84e7925be78df5052d0abffe3 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d43d377d501b0231670ff1e1128c32adc7a81e50 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75552164902d7525a42f5d55f238d202a4786ee6df64ed641c359fa9266c6a44 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ebf815b0d6e3ef9ea333b95ec59a6af7d1a76ca --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5b0456932e7f0e0b7adb165af827f5a8455adc14651aca378fd488dae88df5b +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc7feb86a91ef5a1efcdd7118853a1e2117074ff --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ceb18a7cac38473c724f5dd9ead4d02ee9972733ab4a0f645257c7b98e967d64 +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..724763fa41690428551de0ac247212e810909460 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7493ce51f386c1a9f4addc59743701128210b2cc901d34fd8fcd9aab35b2c4aa +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b4d1229e4718f969ce2fecbeaa112cc72cd2d43 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd0224b8e77341751ba9040591e0a8ce85340a8cc0cb9a6f2e5f7c2854d40efc +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8389f445ac512c9c3d8c5b2b4d24ed16239329f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9767e671a4007f12b66706ac50d105e35d9f7c893c76a424e90d98f8871c2141 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..56e124c0ac3d6e7eee4922bf56bde4ea09e7a2dc --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73a2c71166f1b61cd0c2678951db51046e78953194f35dab274393d0c126a432 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ce6ac297ff42bfd3e449cb938aa4378882a6dc4 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b1ab24e3ee6f61959a05ca854436b6e45255ab27aacfe5bcb1306f07b56b3d5 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d75bdef2b05e4962a20cf81fc31c3f6c3d8a312 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12818928dc4fc10bd88da8a0afe70887399635c65978284b3dc6681110cda855 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8345970a32fa8f23d0a48cff50a4bffc05779e28 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7145a2e3f6a3f8892755d14dbac0ff6c3760ba0ec7f0e819ddb69a84b518204 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..27d574c82c6d9966a5b840045b09c304d10e4a57 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5acb01cf5b3b15ee15eb879362b9a0c8ea4bd33b16a66c8c226c88235d89ba93 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d43bdff6102dd6497f3d70e6c9a872005de903d0 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a56df0deab3efb468d51c5db190178d5995ae545936173a96f5f620ac7e939cf +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3438d70e5b0e7534c6f161c3901af3c5ef40262d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfd8291abfa8a76845cad5ce5792975784a1fef231bf794fd465e05fdc37ee74 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64796494092a258575052193d7be2cdc36337ccb --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db3b4c9783b5514be9ad23c1c2c3c2ecf53155ebb04f245079aafe87dfe0bdd2 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f274f3d9510c38e8d943cd3af1a62ce5770e7fe --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1ed9e9bcdb444a03ce7b61b07bb7c38d4cc3835f98bab088814859fa67a9a9a +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18a5806f35ffbb4534af6dfb7c3c88afa96e5535 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4db28a6aaa14b907472f187a810a034f97e7cd55079d38c8d37bfc8e6909e9a5 +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f757f98a166eb6172b901e41b1274a81dabd9486 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:432fd05fddc6fc424c7d1778219b7b83f5471798b2c36c4435c4e368c9f3713b +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d56089a96dcf0e0bb46a31665d0ddff7620e411b --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3d4caff4360bb4723abc3faf28e377f5a2f0b8a3eacc5474b3faf393e42a8d8 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..70534ff573b4ac7afa14ac5c8a1faed75ed314bf --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26fcc5f81951517ed3c64e9196eedf00895efc7371efb7395ad931698440b2fe +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa0fdcfe16d79d76cda7f7e9d1d6c5cb0fb52c7e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f8c6b98aec08c4f88afe2fe38b1e2f76e212f9e8cb3fbef5e0136c91eccf760 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d03e43bb32790521dc1aa61968ee093a24cf5505 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14907ccce621a720f7bbe0575440e46de2721292e201fe787847acfbd23d0c40 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0aed7b640991a606d7d50d6d8d95e84690ce1144 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17214d02a64815acb846e6e58f3fd55b17cab1e965d2bd7d32d2d02a71c61c94 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..797f1954c45b6fb332b1045af7e2e017f4fff012 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b30d896030176019ad1b120fc2628d9228f4c7e9b687bd11ee43f8ebea36f1f0 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be5b63732ee2d0ebbd22c019871b65b3a7ed6c50 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f17bfd3f77fa401e6044fe38f31061298d5556394a7bb963cc3a18703360669 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10471f128f529b53d732afefb496c74f81a24464 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72defe9ec85087c3eb9d34956cf4afc97f604cd08250aa101c8c9f41ebd51991 +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f0ca244cb097911282938cd6b01487063e1affe --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dbf7e8f7e4d0178b0dafa713656f54e084642659f752f8be86141427d72db0c +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b28d794ebd86b6729fa9dad36282a73ab17db5d7 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a6b3578647c78435ab944a24a0a4e16fee10ba4750d6692ece978d60f90b74f +size 208732066 diff --git a/global_step5494/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa8a2e691fedcfe75d2b82c5dfee26d9d1611e6b --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b7f20d71296b0682de2cdd9e81d24120d54ff6772a218aba79d8301aa17585e +size 208732066 diff --git a/global_step5494/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e1fce2b539d4bea4dc4462841748b7766131e45 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e3731c02910d2a4a1ab673308b4b504d57b2f564e3773a9b033af9e988d2328 +size 208731415 diff --git a/global_step5494/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74f10ba4b8a6a32f361656115af97ba067fee617 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74d8370ce4fcbeedbf8fa83a4170a953f0e00b9221b830b3f7cf485fbcc2d017 +size 208731415 diff --git a/global_step5494/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ca769a73ebe744592564d79e66143d128b7ac55 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00c6321a3bb69facd4befed9eba0c8329db5ce9333b69068becbd055540f5a7c +size 208732183 diff --git a/global_step5494/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16a0a63e6c3a53a625b5079bd0eab48f9c9b9db6 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea20bbceb51d872c43e54f1d67e10d94bff8c9e8a02c9583b4ef14e42e8d6766 +size 208732183 diff --git a/global_step5494/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a8a9d454ef14a6c651fbdb2c36be0174ca91aaf --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:768b326044c775dac174aaf03a191ab3365816227fb2117f4546729d85ad6169 +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..891835b079a0411d520acc8df41d799742d1b2d7 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8385ac4c9ec0c0d8333b23fb0541dfa13abd72e6f706873e609434e1658ac1cf +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..87e1307ea208d482232318b74ca5d1312e9cd9fd --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2d365fd981b707bb7de927e341409ce61056048a74b65df92da2a35489a6c2b +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f7b72471d56621f4fd1594d13c4d1d68623a723 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad6736243c51b0359df5c7d1879330ccde084cf8ddb768aa6fcaf8719465594c +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73f06120e8dba21d3d5f8a3f88492bd264b8d8d3 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:076d9714ed50896cc63a450f131f55970253d2bbad4dc9fc34af247919ccf54b +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d86fa1acd144677db0e35bc4aee4db9296ddcefa --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:753cc7dddc675b685811d74fe9fa001dbb83d515fbb3d5770acfd7012987f100 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..178f7e8511dbfc3cbdba15ac1c3894e5e3ca0956 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee2740659741aa14b03b979355c3d4fcdd602970c102fe7e157c98d38979ea1f +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b691719f893c463b998604d769b2c3cbfd809fd2 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad463a9aaa37d3a163755757fd0964351f4835c652c53c04d84ef06f1616f4ba +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3cc5fa0d1719fb601c4823aef9496b1fcc43e43b --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e682f932ba5a109c554b808b1f51120dbfa1ca179044338f243a4a282c580e5 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46643c73cdc947fdf1fecb20c243e0e14dd97b74 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afc457e5e116fd365bbe9e3338f9ca340f80e2a4910f580782fdf9308ad37077 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1eaaf293415dd3c8c37520a46b6ae9feb8b08360 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e189eb2518bb04891ce39aa020a1523ac86143bcfafcc179c0c936f5f4de76ac +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9459eca6e743ac4338a9a3a9e4e3d0120d1d887 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edb13c2ed0870c1f5711647a4151faaf326f8dec076740e48e11e71956c698be +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77d7df7611f4ae32289dc641ad098fef939e6d95 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81ca88a66940e37fc35df81e42c35f9515b6b53f95ff8896fcd6deca56816fd4 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8595d7c40427662f5cf8c26877db594d0a479976 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18736753bba2989172386047d10ba2be6099519364c02071d4f13c8ab94637e2 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eaf55b2ec275723b0405639d21dba2b0abc8e6b7 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eb9e267bedcd8bced5f2f59e251ad308f021fae374751d636b96e9b9b13fe69 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53ff3158197adf68660cb3494874a1d89ebf7db2 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a689de78354e5fae1145e814b9a20ec777e9e8b6357db9822e4676523abfadd9 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a85ad07b31eea07928ed81cc1f516a16f380377 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6288d9ae462f05691b8976dafe99155331109e3e64184cca5357e181974bc447 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1479e209f33978379c59c1f373d2a229c2d56882 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58a3ad95bbae139197c8b8a26280d93943376850151efc16981b000fe79b1679 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac46024e11deba9dde9ad9a547d90666810d7e08 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a74219ae1fb49fd179b1df24afee6127253a5c2ab9dfec53bcaa17ad13c2ad2 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddb410d59e503031c7b86de66e68aa11ac2ddef1 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c33ac444d769b12aeb644bdad18e6bd30616153b9246dc91656ec2aec867746c +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9f986d476119bdbe5996e2aeec1b6e1e7be37ae --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa57f45ecb7ad25393c1c56bc494d2556364c45a3db63a15969f99993a69b053 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9370d058a477d97cca75ca922d011b3187e6a28 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0041b112d6e6c1d782765944811c4b7dd8e6c9c42b138df8b300928231699e5 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..000fecb81d87d138e32664d91721630170b58d2a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0165d2110c8595f58a2b7e8e33eb47df4fbcece6c21e36298570b0aa33195d3 +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ac809ea9db61dcc693d714a6b418e1fbf105815 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5915421c444dd2e2bb70b33a4e466286748ea8cfa3cbead347b8175eaa746422 +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f6658dbdf707821399080e80fad64e5f9f3167e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c8adaac36df2faea88014fb475d5ab81dc5105966981b07f1fe3e93e0f220e0 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9d83c444d35efa19303a12a688a100ee302a2ff --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:838e490f1c0699fb1b202f898c1106a37ceaa699e5a821a4fa1cc0b8fd342865 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a360ae831abce1a44d6e80b06e78c7d118b14953 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8414d36209e882de17d65939b2a937bcd291afc1b31d8b5a39ddfd0e3a015d43 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac3891936125d8ceb98a1349fc1eef34c738a235 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4e819cdc9223d1109030ff147209e22c57e02ec1ea4e522f798829533461dd5 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd059aae87058df460a50d61849124d0cd321251 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96e1b369f57ba41312ef6c8614095798df24aab0120bedfa0dde59d34aa20dad +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8f3a5d89b37f955c7bd9086f8aac124a5610488 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6776d64bca7f52d3b2d62e64960b6f2ab515c1124ad73029bff61c0613bf04f3 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfab9e8a0e6468cc4518009f6f91593e0de41dbe --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff8ae38fe1ba339ffa9d83d68ca8067ea0a58f27537d1b01b8936534ef3459ce +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9fc62725991155ce7341473cf880721bea2240e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cc2d15f15b84f5082d3093e3c64efc67ad476e2bccb2df3d614fbd7c602de1a +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5bfecf61e1898355731208efc893b3ec42993333 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe3e0f84bcadb53099c083251e4d117a70da52fcceec8a91e87272621b63e507 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a9bf7d540d156351cd3364ae7591c1d9b060428 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c71dbd44e665e7295190b61e32eb806fd3e5394d5c3ebe392ecf3844e25e94c +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c00a541fcd20965854d3606603195ac3aea40e5e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48ed952eb9bb953257416298d332645de243df6f085487952328f8a98785d809 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c75e10684971b47cc5598d0ac683b4bef6a7fa95 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dea7d364d7fe3f05b6b9c43ce922a43c2ca774026938e4129ea2ee85d7554b04 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e2d39b3c323515f1207953a1c808ff83c856567 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3892a16a4369790e53571e612b13cd1a0900a4ed73e7f8d7665b60229d0c1dd5 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b40b1dfd41c44c356d4d27a239ea02eec6e2250 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cefdab48454723f333ab60f5c91fa676c6f9c208b79b098bf86d5cf7f4cfebcd +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..644985f38583da311333f7a4b942ca1f17248c9b --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c337c2b3c9e4cd08d331168dea60d19f2ed60cfc3343cea3db9b492f7ef806c4 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f97bb573bff1a49de97e27d2bb4542c87a0ce21 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b24d3d6050e1eb5d7608f0d4f85e5bd449583f6e6ccbdddbcf1823c4590485b1 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..544484300567fb46489f2c4fb47ef0a8898864b3 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3acc544748a78641dcb5dd3579de0ad5446b7deb94b4b51aa5c995319cc3fa6a +size 208731351 diff --git a/global_step5494/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9deb0510f56703626dc06471c840c10b6f942602 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3716b88faa6465a6778a5e07b41777e83d99326ca6a9718a4a940c2827e3ad4e +size 208731351 diff --git a/global_step5494/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..664db26bd0f95a8122d273a916b64b104b6ebc12 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00701e3afd5d27742751927ae91a273befb7eca1d74eb3a0536e4a570e54cf50 +size 208732119 diff --git a/global_step5494/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..363b5a2caece192491eb8f025327bf5385f65704 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e6cf0d4fc769054cbe040caafd13667314fddfa56c9c9d08bcb458c692bc77b +size 208732119 diff --git a/global_step5494/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9aeea28370e763721947c44dc8ac9d3843a648f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca34fdbfde9c355e2c985fbed347e2eae1fb8777d153c86dcf9d1705101deae2 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03f3d9b7f1169a9b65a9ad5000e1829cfe4a0a7a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc8ea8de687f679baf91d939a3f0c392ecee250c2e26385f9a54910c01fb843 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..96bf44ad162622d83c2db6f925f0c49988c6b2cf --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70afecc9de55640314ef03b753128ab00492c45ae09e644c4c550f0e3a682e51 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6abd04761a5f5c2f633315cb2d5c308bde1efcc5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9362a25bd34d67baa6053c16cbd1dd546927007498ddf17ccd9bc2d76b2c0e73 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f76f72ae10ce69ba71968752d2597e2694676e7 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:697738e1453c8bc1b1b9b653329f0c460dd31f31c8a4579addd0c4eb581fdfd7 +size 208731234 diff --git a/global_step5494/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7827edd5302d36d10ba6f570f14c89bd346bb7e1 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9c4d58554acc9193b324daab1bbaee0ecf990bc6454291880fe2b31bd0da42a +size 208731234 diff --git a/global_step5494/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ecbbd1ecf24f6878813a74deab00258daa3f1b95 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee8b77afcc23e31dbf51274204eca1f9a6209345d3cb6858fd3f97771ff6cca1 +size 208732066 diff --git a/global_step5494/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6bf7deef7201c40dc64020ebbee1afbdfe55e85 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c9ff2f90bcf7d26d8869cd01037b3dd97ff4bba59f0ed980345554c18a9c759 +size 208732066 diff --git a/global_step5494/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..152e8933b2ef7c0fb7f178bb0ba375f0051701a6 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:606b50c9e660a0eaa036d8cddf03f95089bece4937975396aa634e1bafd9ab88 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd68fb8d7b2ac93b1f2989c6bcf30d8443228a27 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5b32f27f80f9bb73750ec5fa6913c19226ae855dfb1b733a48aefaa71ac9c4d +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..651c1a699829b61fe9213b5d55709f603b2935c0 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fae985d04c1c449fa67e36899a0bcd175e795eb33e232411fdea5bc5c473531 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..385612efccfed9f099518e2a414d93c94e9cf2bd --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f10221b1260799be8e3c8e2264a566bcd2aeabdf5c2f1f8a9f7a17d7496e292e +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b88858ad0794f63a90c34f6e0d1ed499fe89b699 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80b4c4fe2af809f99836d1780e7b9bb360338f667a257bb950723339e0b3c6f1 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c538ecc6138291ad339d5969466d1b62b62c9858 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bae1fab05a92b18f5cb0abeec4bef7be112152ad74d30c3924002faf5f567f0 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..272480aa8830d21dfe539021df8ddfdb347d731b --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:020094e37cd8fab393e36d54119891b090ddcd6f7813011020328cd26473dc7b +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd1ea741845076813f5cee37af17af823f50ad26 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a598df303cecb6ff031e5c0e1a4b3baec36ef4515161785c027064bb47d1372 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38ad13fbd0f52a7bdd9454c5cf9c5cb81e24524c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c450d16a5fa0e87196c42aa1cd43131430ac513bf6134d9831eb3697b3e2a766 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76423e48d89a07b86f918212bf944e4e3ce03f3a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8681bdbe7e9c426ba6c866c5b9f1c493d52ec19710533228eb9be5b1a90dda21 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c424156e341d23c859a64eb24642c649c77f3db --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61d7d9335b6c048161a4cae81da5cd94d30d68cb0661b34fbddf6925e8264837 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..206a359670cf4d9d0b599a47578152d5e3b81fcd --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b24c39bb066b2f96b086970df7bf3e904b5f9f20c537176c7176121c1966f4e1 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..70a9d68d65705bec337ed5f4a1c21305b76f546c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa276f731e96487d31fa6dfd0608e4e226d938b1b89e0cded20dfea0af05e008 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..915f15aa16af6d66abbce3a2eb8a9964d42a48ba --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8604a345096301d85362bab2825f12a4976f5ffe0553cebfc0477f79cee883d +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3fb355a8b5716adf2a9df124b2fc5733f48a81fa --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45d992307ddeb8ae47dbbc8c282a3744fa2fbbb48d206639180b1bfc34b1b17f +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bec0543adebf587452f4f857976e517ece93cf7 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:278413e7f363c473e6defee3b2ca97fa7ae9c832fd038d318bf9c57e1de57393 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6a98716a5f07d4e0205904f365105a4848488e1 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51efabfeb7cd6c7134e487bc4a96472ec281d79eadc669cf0a08b5e5935e0aae +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d0b084be5998e7e95614074fcbce2d356e422fe --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8b39ec1ad80f7d1efb13c6604a035d5cffbebe38c395fee3ce93d38932d4337 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c008fc8676cec06ead3ac320c5855bee129ed66 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1905854cae6b9c567d1228b6d92d1942933db4a7bb75dcb06e5bf9f0bff7d5e +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b371bd16e9f7e50b887199c5cf84921b5a1045ca --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddaa1f3b2d6b0cd609f84f1fab8ea88386e353ea405aa2ef948c4c204d969ea7 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a83e6854c08ebfb730eadc46e1c72b85905b96b8 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:654c6b3c917aef5561444987a0aa96a32c98e2ce4d794db079aaaf3404f433f1 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9df73865a4bbef653db73a132c24bac72fad73c8 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9593ff245f785e0e1f0d29e4bc4b73452b10fed8ede3789fe0444a5d51869e9 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f19eb4a2c37d6c908a970d10db3ae5f4c7afa43 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb5a54f6b38eb4cbe5fdf3802856825d3587303479f9dda51f999df7085b54e5 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb25cea152121d74ab512ffcfffaf071a66a99ab --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82c384c8dbc8376a7f7577f6839488f0135b9716a6fb46180025b9aac92e45da +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd2c35ec486062401c8a6029c689142e6f9da87f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97401e08dca5e07598ebbe82eb8136bfd68ad90dd9539ca6be4abd37a3980187 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..579a47d07a29bb252ee0dbfddf3e7dcb93c3d7be --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5f35e27e6301df9a8a1dc1123d0dbb95437f6fb234e325cc700575acf373fe3 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7803dfd54cf2c47199061c0d6ed93fb1970c0fa2 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fbd62414fea7a496fed3ed4dafd248063d123a72e517e5629197141a886749c +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4067c1bf4cccd0dc585413c9820930a5f59e0a0a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0c727197348c489eb726f07904422b8fadcae73531817e60070bb8df3824ba +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15e63de1a85cb2f65fa1e804a34f793f29f96a84 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d43bf4010e5bf5ba84efabf68bd0f752f4f296fa23ea6562a1dc9a0e00548e1 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4313a41e83d637ac04c517483f2e243c8620cfb --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d88761c775d1717b59432bec07042fda18ea4f9cd9a0e6406f8462e29e810e2a +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a0bc7d55f9626c3ead3baab32899d93ed7ba4ab --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d339c765a381f496db9b4aeedc92793d301b24107338d84b91998ad51de443c3 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ceff1ccf599b88190c63b78e7a6d5b4076e38e6 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7750996bfa414bc9ac676921e9c67bae994b6a72288680079caa7ae195ceb570 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9dac7840071d6cc6259e65319c03b53b0105802a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f47623ae67c970429ef8860b65a6fb49537889e11ec30a0945513d5b6b036c2 +size 208731415 diff --git a/global_step5494/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c020d7ce11ad99145c0e0690989969c70ef1187e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fea1e8b3d6241b80f3c47656738a427bd6ab91e6030d983cb571401246dd9b7 +size 208731415 diff --git a/global_step5494/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a61f4e63ffa0d2668f1e9266486ec3d6c75d13b6 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e33f62b3871b757ee3cd0d5180808b6bfb547cfbcc358c66174d5c3f4906332 +size 208732183 diff --git a/global_step5494/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13ba2be96d2af69d6281dd5dc4914a926412e773 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:177be61494fe46d6bfded6e94317c121504cde0a217bed95e8ee0e1cd90f4b4f +size 208732183 diff --git a/global_step5494/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6b2d30789945d59227761d1ccd346b070648950 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0b03c252416c5e0799845dabc8829e99ae74fa17297d5ed8904f1c9a5a5a73e +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..922ee8909021d6d3eb48108f77614b8aebd7ee22 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:839274ecdb19921856e6ec9c3f79e1f901833d45cb99b17e607b959b4266baf9 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cfa75d1858dfe0f6bf61834d9fd3cbf7b00f1c5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa66a58f0aeed54cf633fb0e9f209e42aabdddfccf2be3890089015934507d84 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..865ab74dfbce56b2cd7663d7b8d0319ad894a129 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50638addf0732f7415e19b89558eb1a353a93729121c4671d8a96ed58a65f98b +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..30a09f3ccb2795e0017c8f8aa23d83a8cdec6ab5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90135fe1fcd28eed83b9af15e9490aa5e3bc87215d21fef22b4ebe4e3bf342e4 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c809c1f6bd0caf71bec3bf27905f7858a85c1bd3 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e289cbf427f6889ce9dcc602abd9dc599a01f06fc4efa89f4f7115f329165eec +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..28bbae77fa05b69661f9c0b8a95f374816feabb0 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d4064b71aec54be5030ffbf58f8ad207bd1ba943c50dc325edc9dac777ce4b9 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfd63df76749d34d575d8ef167b210364a752b2f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:123a5715f24289cfa87d1ee6f059c3d60624364186bfb8885c6e1ade3fd7aa64 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..939695365ca683195adee620ffc8100f3f249364 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d47d9bd27fe8abb9b84d45ba7fc3885aa6762a4052399e1716ccf55b2a9f3aec +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f22fa21cd6a188f72b9078be305da066ee698657 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a55dfe3b869141942492af0b7624ba982720a2dd818515c99405d128aedcd430 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..300e6f91546a4864be3bcbd55cde3fa787c6c12f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67d7f5313f6675d90c0a3d48bad24aa851acd1d3ba546eaaf3a22fcb01070e4d +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9193110b10fabb5d5e1cc3abb2435da25336443 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16762f1ca62664863854824027f029460de5bd0f84878c1c4e26fe715e36fe6b +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cc084ae944f2050949956df56ff6202624bab92 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d66c31a288d51741b95d5f50ce7d313a4c3bbfde6f3e5b3ff9a17ee83b442d3 +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c9854c42468a3ae0cbfad76c31352789eef4081 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9673f72c52c63b9dc8179708d380b7bf08b940d751fd30c52465e507ac59cb02 +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a09366142372fc6ae8ddc749a41361d9e9ce79c5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af524d3e6c47b48e9904acb968271b4355e21c1a18c2c981613aaa076fa15ecf +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a13af577e8cbb64d036f899df8bac8dd0bd6cd7 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ba62184970b1dee751345d2e71230fb57cc3139a0990fe82651abafaecea852 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d18be262422a37e437303f730ef58c35408bfa2a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a2bf7049ef9668c4d9457cd3aca212a852613bc6d0bef182a281ac3ea0574fa +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6fa44e61f914f019b003bca8aaa2d05a627fd0e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f85d7d8ffeea94814ff7cb38f59f3e8a0979401846602a7f9a95fc0edd2fc9a +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e93d175c9dd02cf4113cae44ca4fa3a110196702 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ef1c448bc0adda209006928f8f50c20507f6d03f5804cd2ec5fee7fcc46d268 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..063017fd15ee991ff7e471ba750442b3988ff1b8 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:680cc9c73bc457e2807518f5c3dad13eeb31dc0e3e74dadae5a29f45f88ae8b2 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0bed3026bcb13b19ed540cac20201c5c9db683f0 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cfb5e2c64f651b304fb75ce8f74b2e2833b5086117aa003c45fa0b47b712d80 +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d55144990317609c23be147c3c16ed5b7c895ba --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:921832ad87c28569a6ffb7b01d2ca26d88d0d3115e8b64e48a07e35c9db9d6fa +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..691fc7f5dc488f6c592472739b2ad6f50283d9c4 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ec553a44e177744c8d912745820607d46fc09eb2a0e779a20418011a678dd88 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..718f3d4ad924f9ed62997d18df4642b9aceffcb5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b1d371653c75b0fffab8d07f2c9809db95a21c1dff454b1425c2bef985fa608 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b9f2252cfcea7f17d9d4d69c18184b079cf27e5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99d4bcb85ab46eb3b5db1d96de264ba0a25f43ff55ff52ea9b2c33abaac2a934 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..849e16d1eb925af36afe48927279ea5b84c8c9c3 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cd4a96bfa9b4a914b029e9ebacbcd7437d3f9024143fd27976bf627e38cc5e0 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d67affeabe11ca8e696d70d28823f3999fbc30a4 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b3d0700ec246c2ca6538c43d90630654797833f93da03d7a010ea80f4a0918b +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7c00ac9f050fcd8c9fd497026960e87495107d2 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0592892b873e8c3bc8c158371be90332f0a3a15425c6953a2faa3896bf6f8fc8 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c4d1e3c302dc66ddcf77d0e6328dd0cd92952bc --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc72a27754ea155c4ee4c31dfda737ae697a0603e0467688ded893f23cf65da0 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d9647e31be6943caf1eaed0769145f534f81e32 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4adc3d3a1258b73ac7ca1a11c3bc74a8b4a13686f799fc042e93b527d47a828b +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..28599b2c53da57dce13e9f757bc52cdd137a0c98 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b06b2bdf5215e93a497e4aedf34cf945f8dc04146a6c49b7636ebe7d76dc8a7d +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b41e1eea394071ac1857fee19a033001709b7123 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8a590ae7caaa471d7a9f65d24a3d25f42c9758741bf88a2889bc5eac3f46656 +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5239cadbfb780ce02bb008af7bb7b0b628e180d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c86c56c57ab1c76ad38fcb98cb8633572bf99867062b8d3b80120f8c8b98b6ce +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54a8d16ff8746259c92eeca255d77364ef5385f4 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:451c56ec818d3d3b985c5330f05ffe186811c047eb695403fb3497c9cc6a7cd7 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4cf1235c18e183d715eb3c9e067d8bab74afddd0 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a00e0b5c5de5c178d8b1b4d90fb357c50672b9cc4847c8b1485cdcd1585af33 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ce788a874dd5d5dd4d5657712cedf39823f78a9 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baeed35c0bad91b8c175ae5d471f27019ef22a83c4365b7068b46a8fdb54058a +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9b7ed7acc1fb6f91a2e3fbeb41df94bf642b6b2 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b00701b6447d6228c8b343bff93468bb8b8baa9a913c17523b4203ea8d056bf +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..103733b0fa902e5495fdc78c6602f856c2d82e1f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1287c6bb3e7f93f8c9a164a0d3a5cec82650d46b40bb65049db5cbf3ce79e06e +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64011764efd84c898a1c89b38d13ae275ba5df3e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe838e49ca894a4c67e8ddb42b1a840a3c0fe7b037a396373747ebd355e6c867 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f482b5fa697aa066557cce1a2d7b233cb929141 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c27bf8e7a9c39e7a311fdeebbc6ce338e542e6df6362787d6c5a93b71e3f1fe +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ea837517a245e588e5cfd02bec7d1fd8d0e17bb --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9642267f64b7cb493291c2d4fc01a7a1263d4cf033545f61f841e8d37b1e0be1 +size 208731351 diff --git a/global_step5494/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0abc5875650eba143eb8f30785438cb4bc37c60 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be35c3f79801962da0e708451e44bc6c8f7a93242f082ede72e0b1deffd2b344 +size 208731351 diff --git a/global_step5494/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9a0c568c30aac23eca7ab4f8406115c0a63b666 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcdb285d0e5c934f327ae572094884ded97049eb525527ffeacfcc28c97a5ef6 +size 208732119 diff --git a/global_step5494/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f217ff690cc628b9e919a9a8e40e956a76d64ac --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2e5142118dd8a54b00d55fb0b01d5dde5a6616db226404253e6612416cc70aa +size 208732119 diff --git a/global_step5494/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..191eb0183f68d51070845b3127984b2e8be2094f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5938410c2b3923c235c23d6c3ec524e724c8265d2b624403f1df978c50cfcac +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3a56fdf6fcfab43935a6df6c08a2950d9a671e9 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:446e28b72be5bdb67c700c0727517f522ac73da158aedd30eb835043a598e99c +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14443f2d65a902ef78519348495bf33b62dce4a5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50048ec8118b4bc061d9bef8c3c9b1e18ff348461cd82b9a780a763243c5f566 +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2fe99ab1a169403f80739f37b6698233c279f62 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0f6dcd08e3d853dca2c40ca0cc56fb6e0bc4f12f0e1ba16c82edb241ffbaaec +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..263f2b282ccf4017601e8be5138398645be33f0c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22302d46abb25fb997c8fff6fbbcc2af4e892783b74ddc185c36a7ee60c4f4b1 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..591f61d64eb5299f621e9b9f823d0b78114de273 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62e9d6384d7635181e07cd8e258960281380659f32a8b74cc97530ff3c6f3372 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b0f2d4dd8bc1d6cd61fde265331f57014ada645 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3eaa594c8873d58e56a1350ee6662ad1599c28f33dae72cc140d9b8bc44c894 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..404be32015305309d923e17fc0eeb02586534935 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6851c5f9939bdf2d70531afc2430778ed5268f054176b352c3b1f1bb148550c9 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..371ab028c607db131a0d9d484fb8d814c6ab9c11 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26921a8ad856bed37e9f2f33ac76c412a690111e8c23711e586fbb3ab34e5eb5 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92b48bb6411661af5bd328daffa9d3fde495ce9e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3d8a8862acf2ea25c8eadd8841886e97b730575d1c844c646506c363d7a0c8a +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..666fb9e5c38c4226584c9a7d5e54436f08e47d8f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbd403c2aed1ce12f6aa5680990c156b05f87ab03b5c1ae228d9c6f6af3ed2c8 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5bc95beba4f742e5437d41060acfcadcc3e5348b --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be35163ffc430919a2126d9edd2c23fcf02e33a742dfa56f68703c92befe766a +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75eec5caee545993b8cff213e0909ddc2d9217e1 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fef440c267e8dfcc7c9ed74a12092a0e374c95755920819453c2ba5e5770e98 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4e67bdaca57f1f46431a6d43fbb1e990e8b3c45 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52631dff229c81ccd76face486e696c39d57042db9a41a50cddeaf1759b304c2 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10e03954c353fa864d9fd4672f40324de15a9fc2 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef7fb67790668b68036275a704d5bf2ab2e0a84bdfb96b5718caa8a517ad00a7 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..461df49226df716510dbd73c90b1062f0a20e239 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79cb95f96624b43ce179f95b6bafe6c12af60b6b007b604c6fa0d89eed719eea +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..102820feabe65c731032b96e968de5ad3f11ba52 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ec2623f9d49f21d658818140693104886e7312c12c6544a304b12d6927c1a88 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd2c5e6b8d750a14db065abf168b095c5ececba0 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d78dce823ea85a62e8fcc35777dcb0cdf97306524c14d088a6a2e97baec5160a +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bb845874a84d92e6f06ca6fc143e0a5c4693032 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b63af78aab9a1322cbbf97c09820a9426bbbde5df2163404cdf8c3e023cda1ab +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e3814e223bf86baffe2032170b9a8968861539a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5c922372da64c07d1e4e171c6dd2a131fc3357b2ffbaaf37071cd63f89f076f +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..128691fa345a11a77e8b69dd6f3c58b4f048c02c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f5ee7847c3ccb821a501fb4cf7aa0c5b8199cc38e62e274a826725a342e116b +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92fdf42bb1ebd4b454519b09a51791c5d58a7ab2 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:129e9f25c0859cac503a347b1c512eb31a495267ab2d89dd8c23ccb77aeae8b1 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d7f855477b4c03064b3a4094475748e6aa50989 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b030a3d3980e77916bac22e8db42504d8344a90bd8c4a0997925dcb96a1e817 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe5ebbbb22d56b80989c316e52a98cdb2baaac3a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d9f36b582c41c80305f44e10c69ac88f34354e7cdc6845ddf6492e0a3d4571 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2c96137ead8b9b94b835f1ea147cb48f3f527bc --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f20708ca2b6b5fefeb952d5781840e42a3f1063503c852ca5e2024ed8bdd1c49 +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf550e8358718dd9dfbd9137df2080478799382a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc3036c667303376fc83e211d78f0a7cbb0bf3c429f459a6d27f4b28fbc81c2f +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebd3efc332cb93f89e54056516aa9ee9a806e130 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a568c3c37e8d69482225a582d60f16407c1c0724da334f62956a8fb373267c0c +size 208732066 diff --git a/global_step5494/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea495963b5f09ccb97afd94dea148f1625b89146 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:948c0892d6fe23cc6b3e3013bc44ecf724fdb9243133e24c5904c626a185a3d7 +size 208732066 diff --git a/global_step5494/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3dd1234adbca06828282c1363ba43ec0ce541f70 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c7e52dffa19c8f401adbb8fabd0dfc26cd8a3054ea21af941f8e97510519265 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..28e79903e2169174d45fea308c8e0c95bf9a7aaa --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1a8685f22565ad26a1e427e9d44ed3ffc93f8d5e61a02291fc8bbe4d1bcc655 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee35659fbc1321be8e4d8ae8bd50598b4e8657f5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c00bbaad82368cc8ae9e202c2d0ab6c7c1eb3c38df4c4e5a576d8db104e85038 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31d8b5d9ca234aa7e24b43b53394c0f914ac928f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acd25358f13bbab8e3851502a1336c95d30e2fb67456d32fb0f31f1fa613acce +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0783f2233cd87e2d8978b375a52656710d0933d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22ca950036a02c05d67983f1d065dade1d31300b6bbd1d62726f715264fa3777 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..248f54b2a1a26af3d52f9ad95911e0e6a1bdaa00 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcf078082f57203ba9781128dca371674fd09e6ee9c520325a5d2437fd7f80a1 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92e383ca61d0c29b3bc6bf166204aa6aed62e7aa --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5da6d03ce21f4192aa9bb166ba5bb7a87836a6228cdb542a669ddfc0c7f94d26 +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d821478cdc4effc5fca3c7d5f434fa1cc36d2e7 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b77073a07ef0ac90333ffb8e9308a6596eebf5fd31160223f74db109b93292a +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2ff5f387e9366e0b4c5cb88afa4e1cc03f5bfd7 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:701060e6b9b9aaf5dfa1852682650fe7dda2ed2510a49f9500d60014bc9d2cec +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2af66d90ab4fb36465d5571a5105d42f07075b08 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:570b8f5d390b3b5b8844521547b0faa942c8453a9f1ade139d05e2a004d4bfdf +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1823bd81e1571b953e2791ade38899a724b5f66f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cab1bde61d2111398391053dce346ee2aa5e1004c0ce8ea879523e181c9e66f2 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78ce8689f40b2d65e5d71d0223059ad2e2a020b5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e62e3ed8fcf7af246dbfb9997d695201942bde7b2dcd0cc535aa6b8973d37a01 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e242fba91d9929b46b97d3edcb8303860cdbcbae --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6933d1e5eddcabc2e235b55dbbfb7c4afe04798a484193da3a3fc8e2f88fea21 +size 208731351 diff --git a/global_step5494/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..301a92a442d54b09789161c57b0a6af59f6e002e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:867528c02fafa662223368f534834424c558d270138d6bee469dafab113de008 +size 208731351 diff --git a/global_step5494/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76d9cbcfbcffd830df954c1819f6afed867ab862 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dbeeac54af0eca36dab2ba115a9f7be4ed4644af77f2c135a2bf5408a8e1779 +size 208732119 diff --git a/global_step5494/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..04fe2d15e787d0e13af9d418f9c611571ceab3d5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acbb6a8ce5e5c395f6eba242e2c319d9174e783b32d92198defa8697328d8c15 +size 208732119 diff --git a/global_step5494/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d5706aa3d0ee8e2dff7f9af6b270a51f725c38c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:914292913df5e8e5e0f18ae4a08db86ba65624f49071ce7d613db7886b2cf28f +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..05e25e3108d7b6de9792951b9fa1bd777c9f1870 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55ce98ba3d9babd4aa8c2bdd600a6e58896e0ba1e7b98356928e0d875f84d102 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00fa8d30671a7871126c0124b0d44074a6e9495c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec61ff5eb073b9da26b07794f85a9bd17a97d369e6d20634efa8473dc1428b44 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b45ea81636173b33fa706cb783981431162de41 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc6057f1d33f3e69514dd21e73a25fd858330f042c7efe2d1a6df48bb5f86eb5 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f23c90933969c1fce3fbada093cc9848f101a3cf --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fff745d14e83854499336b540a2cb7933770fa504fb10348c33f02e34b0722b +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..631ade6b799935d421a291cce8879f5ed9c76063 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38762983dcadc75a639248403ffe61e2bdb30d14fca2dfbe1e59059cd82111cf +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df62c9d8c0637b29ac32df6f32b598bc3c7f2158 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8add48d87989489808707b88519e73b87b6b51a0cff2c32cb00fdf0b9314e33c +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a29f8fece7330954c8726fda09055a2280dbdef6 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6077751e931a672f0f1f79597a5bb1ffd8b6cd32e4dcf016c72a8f0a4cd61e73 +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2deca168a7c5642b427ea2eaae1c1e24a801ec9 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfed99306e5014f5ed9a54676aa95e1710664ddd1c41d6c965d205f9a6eb3daf +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfa8938768cd043c62ec44c1f8c2acc7947647fc --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:032c1d025b0efe62a3dda39640b7228b54febecde5633f659db152cfb196c747 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1acd0fc7c369e34e99be8d46bc1af4a84c942853 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33e541a23904281a434d7714ad42f092350294405c79e5b4e6722a54cba5fe8b +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f58e7f95246d350a7d3616d5775fb4b0a7108998 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2358b526258b970f35a17e419e43ffc87b7c6b265fa8e3b00ff6e7d194923b7 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a2f54eb239acd77ab99390bfacf26c010897826 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59862e7df5d6e1bfb52bbf055ded5b310afb2a308b8f4c07731d5c517e7dc54c +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e87a241c1dd943b8d8f367fdcbeed9146565226 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86a1eab224e515f1ed17ba0d2783f561d372b60bf6567de01e460ee2d64e08da +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b0ff261d5d181cf4840996a22f0d2a0e1c845c7 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be1b2d8d01210c3fb565700eee0217904014b622a3579d51397e5b343a9c4e06 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..43ec233d37d120e757ab279eb345709ce8daf52d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffcc579c6e700bcd64e33e2a098ba105c821af3eb6e42a1c14320de4f81dc221 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..66ab1d665a879a31c7444aaafd28e6cb92cfc6a5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf8d2184649e270f5eaedae8c7f5281f52b39032facf06c60cfa5d7a44972648 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ddd89d142c73640c500917a9d5cf6e8b6959fd9 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad4ddd07a99964be01e03b257e2c388d4ce246b997b2885dbfd070e3dabf537d +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..899420d584edd79c82029a814bf17e787b0c9b4f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b97baa8cda9ff59f6b624151a3afee0b998e134bed839c255c4818ff572cd44e +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..60d3389b518c63fb666b4862f24d3ab87d47a3ab --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cc189d99e540ce94f8cc612d5482e2f0d2a806655691e8b799e381b229b1c58 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59e71e9e5660e072226c41dfbc2931fd5b27fae0 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6606d36adfb628b60cbbab3ee95d4a2a2345ae6b2337191d38801f609a2e02ea +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3d1c063d03f89aadb353f9a8b6c9de24f335f12 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7e0f22a30c63ca51b57421eaa4b6649c92ab16408ea7a2ebab934cbd2bf7111 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5dfc1e8f5c95e379ea9456d9b0aba407471a86d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b45611be9439225ef9e043017ef50c2808c4d593fe9bb95d78bcbe71c12657b7 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44f467866adfd066df3a49125cc8bc6be1e6e1fe --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:804c52552877b19a6773229d7d163f1de36c6a85eb84d8160fe4ac55fd229511 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc964a98aa00a9c58a276559779edcab67ceb59f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad4de6c9b24620ae296219e3f5d3b5f7a5f79b733b944edc95ebdb93ae95d9bc +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73b0406e2d8ebbe85656d19200d911619274e59c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc8ad389231dc18a2a45fc1e23adf58093c45fd1ceba103b8c79c8c17a94877f +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bfb6f4a672e4ec27f9c0adb30561a914aaf5b7b --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f02601d3ec78925cb2a8d8a8bda601934bf6440515da4ff22a4b1ed46be1714 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1108c2129145d54212192465b35c32c356181c64 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e07e6452d395a0be0d972a8cc3b011907b22261ddd8f9e69960e4d0ed9f39b5 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..619516574abb1de3efd780e422dcffac0ecc760a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e6a6643ca3b0dab60c272f14f51377d1cba15f2c40081226996ff739ce24fed +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44fb159b0259a1e92b3f38fd71f4ff2e07eac9b8 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d00023eb78049ed1a16442016f6ee7bed8093740cde344dad2268e9ab7fcd945 +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14b0477b39952e1246c139513f0ee7c8311c2b6a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27687cdabb4a8b0dec4c0b1e1508e232e728c63fb27be7a9232428e26f7fe105 +size 208732066 diff --git a/global_step5494/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5d2d04e2f8c1c39e5add98f58c2d0077f341315 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d3f172e30bcbea07f15ddb8c5a73a0b70fc5271c8527149b0ae59dee8198a93 +size 208732066 diff --git a/global_step5494/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a27f164b5fa49c04c00450700cf2b0fe2927bf2 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e03ce36a7c64842679dc6f7319548d24501aeb6f1f2b94701c69dd6fee15c63d +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..114ca546ee87798d40f4d83075c2a6c15fb080b2 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71c5a186509a66ceb285f1eb0fe123c0476b74ee25960ddb40bbb7112c4a522c +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e67d1db1199901b6ac0da242b1b3c346359e4cf5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5546e610418f1342be7d4580b57344762bcd9e8edb2106441a5854dbf437c04 +size 208732066 diff --git a/global_step5494/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0beae2a8adc321562a644a5ec4325ddcedea81e1 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d157462a4680c75f0ec39a2d82f5482187c3066193b23b6ad01eef3d92274bfb +size 208732066 diff --git a/global_step5494/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54fe7cad62a62df185d07264aa89186266d00584 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36cd510929cdafa4f34e28386c4831853e8a85061a8ba9251dbe3e6e9c7161ee +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b703313284a3004947a01f2a7b1a8ec97827d8a5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fea9363ec276256d4f10cbe063916ae21c3c0d2e37cbf9f5af801eaa16ccfc33 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab9d1fa6bcab0051d5d2041e66828eb44a921dc5 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad03aaf23d66c10cbad59103b2b05c181b1d1844ffa07720a94396dba6aa94b +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72a84594100ab82cd583d0c2f45d8f50643eb3e0 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da47675733fa218b0f7b757858a04edb4c839406f4233999455fc78e677685e6 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba6a67f4e706009d939b8dc9454318a52888c7fb --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37ed6dca3d35d0b9696377f33ffdd36d8cfedaec22a1818914513a04437b1d5a +size 208731351 diff --git a/global_step5494/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8554ae5aba66cddadf3dac31c84dc18ed61d5816 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a174b06aa9a1ffd88f1b27c17b85acdc756575808f28141a2068230091baf3e +size 208731351 diff --git a/global_step5494/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9e580e5382da37040480407d298e0a391046b40 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e84982f8ca98bf5fa2429bae7eb225638624a272b5987915acf344bbc6604630 +size 208732119 diff --git a/global_step5494/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..479db19bb2f9a32a539d1dee80ac34e7195fa633 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5c138d564c36a850d914757a7a5eb1888508a0cc2ea971b9de03c6f04df4b16 +size 208732119 diff --git a/global_step5494/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d477df8b4fbb5c7075e90f4241722d32b4a9136 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68e5f788fde504641debd51bbad185b0730ebc29610008e682e32c2451e44bb2 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f74fc6ceb0d170908d981ec955b254a90edac915 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e8cfadad2481cde44856331a6e8141e76da5fa7dc8b95b27945e1564768924e +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2011c10050f72d1639998a3155b6676417315960 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b85cfb4145fc22850ec0cebe277e2f04ecdb8c4aea54d58293ee6c06a34bcf0 +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..691b5cd1498ab16d8de5b27035023cb7aa0c1b6e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e18a98bb8563d05a20574ce88dbd1b7cb56bfbf34268606a0e13dba125c2ff3 +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c85dd072bf7af5bef389997c2c5d659906e7b72 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e87607898583c1d5e01fa34a47e296dd421b303679fbefa8ab819434aea55958 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72accb4d8e7ce1f726f0d370c2429f997ea9a896 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f0fed0c5b2d00283dde85fd6c6e9040428c55055825cb117bf2b40af3a3e7c +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..745d2149fac808cf663041fe083de8cb75f03d4b --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecf30ab8da62adcfa6c50a903a3832148926bb66ca0332dee5e672a1494295c4 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0e9677f1274f5c6950289fcc6ef30f8003e7362 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be6b53724e29501b6f1e16965b19c0b7ef9d71371c540d17d87cc9473d31d762 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6830a125cae661932aa0bc55a2ace87b8ba40a24 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4ddf5dc53951fdb8331e1460679693dfeb3b5a9193c34d913922cc371d857db +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24cfcb8b8a558e528d58f81a243bbe0b5226dc3b --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c95e6178bceecab5760f0f1bb186acb28cb1cf9ccef326b56628af541b91757c +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..974f6b7df8c4e6d0355ef48a497536a6b3302272 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4591ad64dd98f60e459a4a2f2276e727d64e9a13d2378151f1a3cceba2c980b2 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2381af09b9937777aef083722b6d033921ad6ecf --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fb8cb17db8403b5fb82aa7ac57c59c1e15e1a2d8dba05fadbb865790535716f +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4ac0564eae8ff8e08c3053137ec737c14f3c90a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31d9d45585742c1fe02f0651a360b3acbf9636a4c50acd6bf2d729d1ff31afba +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed2d444089593a5bdeb2f9411efa82e57c0ad76d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50ad5cfb9b80d180ccb0bd0078c7f2bf9e3aa01ec5f129d5da1ec4ea31efe493 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf4ea3cad4a3b80a9cb40899db53c872e0d3a3cb --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5c8ee982db7d08f68155e967307eaee808940e6e2aa894217d665c29b01119d +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f2d7d0f15cd18c21d60dc02657863abff4c6ccf --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5af10d2aaeb3377e1a7e2d514afb90e8da51800912149970fbe4a68674451cad +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a9d8f4d5a069b11616b6899a093b518e2e07a1c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4e99d53ec767e367b436a54a679af0299bad3b9c58dc2d5b7cd247825a22678 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7c1b794bf98cda723f92236e110c6d296aadcb4 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c3df6b276fcf870da468462c1400afa06edd548d690fa70f3b80e363cc3afdd +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0d34aa74c7e70e940ef9dd34fb7008d3a25f094 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb0cae8383934d56c4b520cf9cb4df7723826d3c0fb182e720aa836ae2104894 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c64d36ccb431e4a4d6d66758be49240772ade900 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d8e0a039a514dc5437737d0faace23e740fb087ec80739dbf395e4f979aaca1 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..909c61b86cde61fcdedec0a755d649b1499339f4 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:128980103937b81c4307bf962b829ee666330a6be0da932e7e3f993f13e6f577 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b24d37968eb11f84b3ad40525a52e4c93ab645d3 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6a421b0641d53816e7c534d66332d04645d375f483fd96832d18b2a13df3a5c +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef953f4606e8e971fb53124329fc90008ae6b1c9 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35581b6de8f99e5f19a166aced6cb276536b1a24953835f1752d798912810e84 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..912cfe16e21da69b19ea6ed43bc95ef43deb1273 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:435d29374403f91000bfd0df8caad9f6ab184ae20df397ad1ba19c9c11fc365a +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ba55018a2776f82eaaecf4e6899d687502c3774 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:282c1133809f7c0759f832bba9721f4bc06a4bfae761c4387401ef052f89a59e +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad6bcb1054b6c3866dc092eee76f99eb7d9e6394 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86635f13df31953895e4456c27ea220c657353282b9d3c8c2abd861cdfbf50a2 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77bba7a332a2d2bcafda48178dee5f2c54d616ba --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f370bd5968bf6c10abf4a447b26326b7f86b5884c1e0ba082d8948c60dfa81e7 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c9c7c92300ae1615b4c733a56bf7e4cc388c25d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50221868d55aa1427565d018c3f5e4a983464f84d54224aca786c721accf5fbb +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80c51e238e942017cd81dd945f2fa49e6cf92eea --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e128305ea65982078245809de5f1c62dd803bdb490fac6510d0a81118b4363ca +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7b3499bc79a56956f4a8116bec8367134178a0d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:542ffc89da03c84286dfa814df4d7f850db4eafa0d6ccaff52ea2aa534b57f94 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3725ad6d0cad2a2cf56a874e8f502e5daa8838fa --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f1ebbd99c585cd6570c0418ff2a95a53ef4e5a576fa07191c370f24b6188fe3 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3887a69ed96df5f476adfa413ffff1d660fadcc6 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29600b8c2010edad08655580fcc82eb090f44eb397c3941b278c7a78580ac548 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e60ba0b72b98307917886e45e9b9ea1421e96d7 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aa9fa871af8ddeac1eaf783a5960df3929d0632d0542c5fd3ae0295f3ed4b65 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..07765495388d509f170700455ff3b9625a65d035 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db73478f9f4f4ccbcac0914772deb1f39b39d962632386afc07fc6bb20b08efe +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be29d9be8274a72adf1796c498f2a677ab6341df --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4d5a2702b49794ddb6dbe4ff181c25c51ab94985a20fb80689d864a02194d90 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b789c6b963882851143127a8fb770c1bfae13001 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01e803ba5fd3b774dc135be6af52861ac6100b22994bd012379c0e985914e9b1 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1a2b1f572122aa84c78486be1d764c819dd614a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d62aee4159b093195c73649c7aa1399d044afd09b2a0772955fdfc12d5152ce +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71a9f9beead2c23a0475ef29aa83756d2a4e7e9c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14e7c4141d444385a2429fbb97fd5b51b3b834fa3681af4ae67f33b8ee288559 +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f92355c66a52fe25716b8c60eab5601d6a65a01c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:379f9fcad5246f6f903cc59673c7be8b60fe32116883c74347ad380227fbb0fe +size 208732066 diff --git a/global_step5494/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..162f747de9c29bd6eaa4785e386928970e666c59 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59a811df2828516b7abed2b90dbe5893b9efb12bf4fbc389b7f160e646fb2028 +size 208732066 diff --git a/global_step5494/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..630619bad5f34ea1ff1f25ca79e8067fbf08c5cf --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4d47413740430e588761b84a15d6c1fe62f4b3406dc87d4ab088beaf23f0272 +size 208731479 diff --git a/global_step5494/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cea7db83062b9956ea69f4a511fcfa2b7f7ffb36 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:930126ab282ad1d942cdbe1aaf1a66989dcd55a373664ad134b2dc854e7f1340 +size 208731479 diff --git a/global_step5494/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b88f2c0f8141151af90ddca6adf9f7b8253f8834 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31123022ecae8edb3b950e3007f1e141a972e315496a03b6333528e6d5e12b41 +size 208732247 diff --git a/global_step5494/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..daf891f6214067515b6a95173ab85c4798c88461 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f842ff5c41cf0d43632a047e6003b768c86d0e1886d43f22bcdb491fa3514b8f +size 208732247 diff --git a/global_step5494/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cebbde4eb3f336d3cb0a2e0b4e607d18eb2e8ecb --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ede25e549fe2024bfd8781784102c373250538d3629183a5d04bbe5edb50e86b +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..28c9b60f0aa9d3f818b8daec8ebafb5a8d7c58fa --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26bd8ded62db57562ca981b6a70804e2d380ba04b25a6d731bbc6b8a601dbac6 +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa8b28f466a7f15e9927e68444517856b51ba965 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bc72d7bf48c8750187ddc3a62bda790a8761789a2b2e9f467fb406e30ef766e +size 208732066 diff --git a/global_step5494/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ce0c118b8ae2574c2fe206fbb587a93e2f81ddd --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a3bd59713abfc01b2d39abb9351021dd2e76818e5e3d17c2f33836a8f52b6b2 +size 208732066 diff --git a/global_step5494/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..67800c57c077b9074546d6228178ed7b951640d4 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2a09f84586ae09845d29dddcdfecfa416018b7e6e72c874ce7e0adaeee1c044 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4821fe040ba2bcc2f982899daf69af520aaecd1f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41cc45ad43d9c3fba4f48e225901d26c025ef8f6d7af4119aead6618e2ba2684 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14ce222f27b0e3e95aaa930212c89ecbc4533f09 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8469d7a21420406795529413f32265be8afb5c6b5a7f1644de8e37f61c9d5f5 +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..791a7e0cb574f3078fad3bc893273e578d4bfbe7 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37f61aeb464f864c86946b94489ac84cb5fb94f92c098bd4ec02e57409845ed9 +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1685ee014052b5940f5199ca154a9cd7b438d7d6 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6cf00d46c8f3f4703bd40b5ca33cac585fac49effc8ef5a265fba1bb7fec1a6 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ad53dad510b1e034d212945599279be4cdb7af0 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8772e05de801a1ab0da05a69ac35cd043eb0487833a883d23a7bcd037a0f3533 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b638290b74c1169c813003f34aafad9a33ed81e9 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd972ade042b081902112c10360fb35f8f4ab499557282f9a3f444ac9328db7b +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3bbbbc90eb541c4d8c255384fbef9eafc5de29f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:639ddf6ac366438a1e12e5f8d172d7f8351083c86a0934e3c82a7a5fe6b7e583 +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..254507507a5e9d74ee6c80d3f65b84115efcb947 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd3968346cda0fc6df8856c584cf65651497f0a146d97c81d8940d12426ff5ae +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fa35e3d799f3a6d556f604a2dec01ab7dfcfb07 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e3bb92bcf40681dbbec66173ae200feca8d2a2e456df668bc18d36013111be2 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46f153e353ce11cf4774bebf85d14c97cca2788d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e3c0d21d60d8f9023103e389e450e9e9d94914cb303fb51889dbb5ac9aff603 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b14c04f312bf2aba8e8409e26134777c79653ccb --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c25f2ea55c048c0535008f21c8469c9654c7708a2cc9525d3fd55dcbedacdb1 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c13c8c552f936b820c884e9101252a25b792603 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac8828bbcdfdfb46e774f2cf36471bc8edd83f841a490cdf828d282de7a9d429 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..189aef93d2cc29749fea53763eaa38b563bfb6ba --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2769fc8173aeb9fbb53e0750618e4a5e46e5fc76df5e94ea482423902a3c3e9 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f94ea38e2460300678e4e1d31d6723850c67069 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c160625c241e4924e31f355585c485e2b1e1dfa3b9ed385b00967d418d04418b +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..457881f0aac404097c262d9ce21ea90b273d8fb2 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c2372ef403880fd935a6f90e83e8ab33cc3e07a8b3eb1446570cf0c2a041f2b +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1614f60d54203fa03bfaaa321b182a3a825a61dd --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f897d2420afbd28911743432bd2a0c3594fcb8f42b110dd53268e052b73a7d18 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..209c498deb8bc53fff61b17ac88df22a344b8da7 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86ce393c0c82cc3a7df92a9cef2ffdd47a7645e514c437b0cbb05a9b8d277175 +size 208731490 diff --git a/global_step5494/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..810594a9b6d34c50cffaaca0f9595a73e169ac11 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a045dd7cedce8e3f0da15d7c5e7e9e346f539c43f14883d1b045f14515529e8 +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5da598642798f263a8b50670c90d373fb670249 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10bed38bb95e6c79d05c4b6ecd9792aa2bdaec6ff2cbd5ca092bff0ed3bbaef5 +size 208732258 diff --git a/global_step5494/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c81e444735613ff788d892916a5dba8f099154a --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42f6d47ccbe23afeb9ea1d43fd7081389dab09245c5f5b0aa6004c7016ef07d7 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df49b1bd6a2c3264973e1ff0e7bb45e5be925897 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f980f65428137a06e581e556b385c1ed1f106b01da7a6bd2b3566033c13bebd +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..efb09fa32e04dec999e6cb7c63284f0a57dad44f --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abac180830ce6ec0bceefa66669d5452e2d3340ee3319c832b0f6d654badbf3a +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4032d9ae26b6740cb65e55572ef82f5ae80d7fe6 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a2fd427fcb0aca7084d27ba28c65e89b9d8db5bf8ec10436cb1ee0075ebdf54 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ced841c652ee5fef72291e675298897d26e6c0d8 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edd7a042b3c4496d9e3e4aeedf012c0fb4de2a144dcaf003e5c3e3b1d07ae64f +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64160b47319fe5d90aafd6eb9a091474e9fc21e6 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4867f167b09cfd1fdd431a3605a700273b73750c8befdc229779692468990b50 +size 208731298 diff --git a/global_step5494/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88a79ec142d3ab12d42dab6a4a648ba37b1a071e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b94dbec1de5943abf87a751e4b0c835a55b73f47d11d2cb2fc3e9fce4c70c6e +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ca81508be4a7ea5f3876ce7b9e6eb8b2bbd4d96 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd7ff1560d618b8384cf3f6585557145862e65fd9ccac731687fb89e2bc67313 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75fa820ef424cab35c70bfe1c55284d5ec419cb8 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1b0ce0cb7353a011f11db0cb1d029ff0927b2a6f8f3ae225a584b24494ff10c +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..132040b695df4b4815f1acd4797c3ff5241fd2ca --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f4d6469f0e5a5a4bfc7c1b257aaab564d7ddf211e5fc489e998a61601c780c3 +size 208731426 diff --git a/global_step5494/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3eba86bbb8ceeda16ac8a42f18020f04ba3b8568 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06aebe1d347b9025fcb9633be96bec99035f7951bd635cab56946af781045ebd +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1112e6961f1cbcf74feaa6cded2430561fac3040 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f92ef227f4366c43c00852b0522a7d47695978860783959fab1719c680942c7b +size 208732194 diff --git a/global_step5494/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc1516a135d89fd35850a927b119641c054f6a0e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d598e363966ee716a33d4ed47037b51862515625dd83d69c74f6847580f91fbe +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a64864e0628bcf2372b72146bc01abbf324675c --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ca19d5c238a4674c467118605df857cd8a06a42820d75f8b52f8ad39257ea32 +size 208731362 diff --git a/global_step5494/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa9757ec7190091c32738f2c8de14729e30e5796 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5188b5a60ead879900f0f157c73bcdc70aa81e68f931b0c69cffcd2cf8bef81 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5160357ff95b972ff9cedbcc2452ac35a6d040d --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:610fa33435d3c45a336fc5640b5943102389cd47261633fe8e5421bceee97e96 +size 208732130 diff --git a/global_step5494/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/global_step5494/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79c670acd72b6ea300c89264c835888ff1bc1f2e --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd20731cadc6248c14de861d6249d3dee1eb97faca61247870494eb32134d2c8 +size 208731351 diff --git a/global_step5494/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt b/global_step5494/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e296cef084857036da72a8117af75dea131e158 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe7be386b00dbc4f644449fe9d363e79fc6341bd47b56ee8fe1c444d1e516e8f +size 208731351 diff --git a/global_step5494/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt b/global_step5494/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13d8d5b8829f226b9406547b507774a020d93e60 --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0295d27a65acd51c00fec5fa0658f3b545bf748f5d2b6ab033f70a65f11ca4af +size 208732119 diff --git a/global_step5494/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt b/global_step5494/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d262225062ce3cbdba7fb51bcc8b252a5cf93acf --- /dev/null +++ b/global_step5494/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad81b46eea8569a15d1d9e283994f72cf61aa39cfdfae3bb5be97ac71426b7c7 +size 208732119 diff --git a/global_step5494/layer_01-model_00-model_states.pt b/global_step5494/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f569a764f952f7d7191815a2f3971894cba6d67e --- /dev/null +++ b/global_step5494/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:957ad11b8d357ce8d577f2efb0c570852bf1d5f889d363c30a2ac0af183d5f26 +size 223347971 diff --git a/global_step5494/layer_01-model_01-model_states.pt b/global_step5494/layer_01-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bf1490cf39a733c0a0fecc91d5d0038b26a3737 --- /dev/null +++ b/global_step5494/layer_01-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27afae6e2b013be47205eca50f9b8689849b0dc9bf731caf58d481da6c2931bd +size 223347971 diff --git a/global_step5494/layer_03-model_00-model_states.pt b/global_step5494/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e7440502ee14f7030ac0bedd0886f16d6f82dbd --- /dev/null +++ b/global_step5494/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebab360df9cd71add32f6378225ddbc272cb0c693148daee4578bc74c70451da +size 201408771 diff --git a/global_step5494/layer_03-model_01-model_states.pt b/global_step5494/layer_03-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cde04bcbb791487bd838909495229c54269a874 --- /dev/null +++ b/global_step5494/layer_03-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7b4933d14f717bfa4ed473cf6497ee2d9e9f11bc8eb791348db05e91228aeeb +size 201408771 diff --git a/global_step5494/layer_04-model_00-model_states.pt b/global_step5494/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d4ae3e7bad2bc8d36186573c29e067ab09d6e07 --- /dev/null +++ b/global_step5494/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c9788791cbce87fd4067754c55da32c3df50c94df8d033fa7d8bdb627396218 +size 201408771 diff --git a/global_step5494/layer_04-model_01-model_states.pt b/global_step5494/layer_04-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..acd994b61a4245702cc12ab8d7c5655d44b3383b --- /dev/null +++ b/global_step5494/layer_04-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bc312570f3fdf47c0b7e067497dfddbfe87431bdfbfb0f20c264f938896523d +size 201408771 diff --git a/global_step5494/layer_05-model_00-model_states.pt b/global_step5494/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e406c2677488c69266791eff6dc36e7ffd155e9 --- /dev/null +++ b/global_step5494/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6209a8c3e7d50d394aa74e490fb817635e3a19fc343bf3f48baddc03ef2ab0cb +size 201408771 diff --git a/global_step5494/layer_05-model_01-model_states.pt b/global_step5494/layer_05-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bdc5c3c9d266f992c9c211a0c66230089ebb35e --- /dev/null +++ b/global_step5494/layer_05-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9847005f67cc07022dd608d64ab2a488d344323d526b698ce4621c85c4d94f7a +size 201408771 diff --git a/global_step5494/layer_06-model_00-model_states.pt b/global_step5494/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e118d26a6629a3633843d0c58a66d8377e1ed0c --- /dev/null +++ b/global_step5494/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45872cfc734de4910a1e5ff7db44cb72f1c6cbe32d02becf9a5509e5aaa9d033 +size 201408771 diff --git a/global_step5494/layer_06-model_01-model_states.pt b/global_step5494/layer_06-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7ef9c06f5e63c79e0d0f3f11794849bb8836da8 --- /dev/null +++ b/global_step5494/layer_06-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80ea636f02aa9ada3a17e0480423e16615aec21586b23dd2e8a6afd8df62d697 +size 201408771 diff --git a/global_step5494/layer_07-model_00-model_states.pt b/global_step5494/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f345238eefba7d32e39ecd605f6820bad1ff06bf --- /dev/null +++ b/global_step5494/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:936a2bf16c9192df4a0f7c270cf4ccd41a7ab5b3903c891ac6965627638ebae2 +size 201408771 diff --git a/global_step5494/layer_07-model_01-model_states.pt b/global_step5494/layer_07-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f1a6f0694a8eb1d80534fb355490dc4d43aebfb --- /dev/null +++ b/global_step5494/layer_07-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbfaaae800323da6f58f5886bb3deaae1afef427d8f5eb8753aac1c281475eda +size 201408771 diff --git a/global_step5494/layer_08-model_00-model_states.pt b/global_step5494/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59e17423f0e92365c1cf6f4eca0c8a90732bdf5c --- /dev/null +++ b/global_step5494/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f579540cb50f6724fe69e1e18b3b86935974bd6a1ed92268bf99a17609a3d10 +size 201408771 diff --git a/global_step5494/layer_08-model_01-model_states.pt b/global_step5494/layer_08-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6f3a56434f8c0391ba2f0ba31694125eaeadac5 --- /dev/null +++ b/global_step5494/layer_08-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9f087d88bf265e8ee8f700cbee4b132363302d3b52e9d6bb36dbf683dc5005a +size 201408771 diff --git a/global_step5494/layer_09-model_00-model_states.pt b/global_step5494/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3181ffd9ae7bf5e759fcf7b2f5f0b83967ed7d67 --- /dev/null +++ b/global_step5494/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b578f2276cce06ddba0c67a3501e196d69398b36dfcd9cc02740d9d1808aa1c7 +size 201408771 diff --git a/global_step5494/layer_09-model_01-model_states.pt b/global_step5494/layer_09-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a530b398aac21f80ada18cee7cf62d3433b5fd5e --- /dev/null +++ b/global_step5494/layer_09-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f42c4cd0d31928bd7d15d786b7cb61366e2fe53d52217f0879cc69248c575fe6 +size 201408771 diff --git a/global_step5494/layer_10-model_00-model_states.pt b/global_step5494/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..085c50a9670e455d303323bcd311224b2f61dd71 --- /dev/null +++ b/global_step5494/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee5a8ade3a518bc94a376c6127dbc300bf0b3765c114e26975c15f95322ab056 +size 201408771 diff --git a/global_step5494/layer_10-model_01-model_states.pt b/global_step5494/layer_10-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..badde0082736d42a0bc28577b9d1a7564c8d7ff6 --- /dev/null +++ b/global_step5494/layer_10-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52c32095b6e19c9617256cb5214d3480e0571caf14316b5591dc68e386bcc0ae +size 201408771 diff --git a/global_step5494/layer_11-model_00-model_states.pt b/global_step5494/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86c6c64ca7ddeb1505dd913702683096c0b5f0e2 --- /dev/null +++ b/global_step5494/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88d5e09b01d5b9093d4376699cdd4037f2bbd2a1c96b9cd204bb5fda9936c110 +size 201408771 diff --git a/global_step5494/layer_11-model_01-model_states.pt b/global_step5494/layer_11-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c828441059734ccfe00c4aad9d8d779ec23c677e --- /dev/null +++ b/global_step5494/layer_11-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c624b1d131137e131966dc8ed3d65318112bf41f416c4c1d67831f729061a80 +size 201408771 diff --git a/global_step5494/layer_12-model_00-model_states.pt b/global_step5494/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c75dfac6cac8b043a3de9e1b451824de001e9b8e --- /dev/null +++ b/global_step5494/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2c1a2d8bb5612b329f773283b9e4a17305a9ea17ba30ff879fe5f3dfecbf80e +size 201408771 diff --git a/global_step5494/layer_12-model_01-model_states.pt b/global_step5494/layer_12-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d86625d494f513b1cee50948297372669f979e62 --- /dev/null +++ b/global_step5494/layer_12-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43472ec452650219d536b2932743133a9d7b16bf4ac0ff02b7a1512ca0718f0b +size 201408771 diff --git a/global_step5494/layer_13-model_00-model_states.pt b/global_step5494/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..748e424c51673cbcf7b4ee65a1b38a53ba6705d4 --- /dev/null +++ b/global_step5494/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7717d30787a4a9373b6b5df2e2db774abf4facd64d1863f93fc8217858dd79f5 +size 201408771 diff --git a/global_step5494/layer_13-model_01-model_states.pt b/global_step5494/layer_13-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1833377f88951ef28819a2de31a1ef3573846e1 --- /dev/null +++ b/global_step5494/layer_13-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00777c0afdd9c9d962bd311c69bab24764c090dd4461f0791ce98abceb405aef +size 201408771 diff --git a/global_step5494/layer_14-model_00-model_states.pt b/global_step5494/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0efc6f1a8beedb77e7b4d128de733560807eb75 --- /dev/null +++ b/global_step5494/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72b9d2d6f2a6c1ee2a292c98e8b321cadd0ef8af91ea2443b7939a8be036bd38 +size 201408771 diff --git a/global_step5494/layer_14-model_01-model_states.pt b/global_step5494/layer_14-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53e5e173f5d45da54b219106d4a8d33bc1d477f0 --- /dev/null +++ b/global_step5494/layer_14-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc8534b31f2000f89bb5929f2b8ad5974de63bc8177e7effa7b71896826d198b +size 201408771 diff --git a/global_step5494/layer_15-model_00-model_states.pt b/global_step5494/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a34dcfda45a30b84384badb2ca69248ac72f2b63 --- /dev/null +++ b/global_step5494/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b684e3173ece911bf683d9463071d518b72b98018f01119b3df15efdc241633 +size 201408771 diff --git a/global_step5494/layer_15-model_01-model_states.pt b/global_step5494/layer_15-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..721809f6d43cf881396683f1839e82bc1480bef1 --- /dev/null +++ b/global_step5494/layer_15-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b99293a482679750778d2f919e05fd7a6901757b1562aaa658a915e9bdc3ae2 +size 201408771 diff --git a/global_step5494/layer_16-model_00-model_states.pt b/global_step5494/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd57f2756b37e73439a7695fc338fe01ce149cf5 --- /dev/null +++ b/global_step5494/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77a6702e5c23771091d42830d476e60071c9fef06b86f0e79a127e99c459013a +size 201408771 diff --git a/global_step5494/layer_16-model_01-model_states.pt b/global_step5494/layer_16-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72ceb8a033c89fb9e1832975b6caa88a1fc7e13a --- /dev/null +++ b/global_step5494/layer_16-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ae683bf28cf870d7b2fa820646dd4ae8761c32f16c5ca898d475c78a7fba5d5 +size 201408771 diff --git a/global_step5494/layer_17-model_00-model_states.pt b/global_step5494/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78fbf5cae2d8b9776d93b60856d48e88d2fb5525 --- /dev/null +++ b/global_step5494/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e982c7c833fc0f32e1404092228e6cfd9f36e010a7b5ef7325cf0e65f851262f +size 201408771 diff --git a/global_step5494/layer_17-model_01-model_states.pt b/global_step5494/layer_17-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f12c868c133a8738afc4a075c1cb7cc4a16651d --- /dev/null +++ b/global_step5494/layer_17-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9157db0061fb8368e63f4899ace458d67eb50566905b2cf35b1dc2d007b57bee +size 201408771 diff --git a/global_step5494/layer_18-model_00-model_states.pt b/global_step5494/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..092cc93806f5351f5339bc30809a10800d78b8d9 --- /dev/null +++ b/global_step5494/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d6ce610938de05768e30da741763b236209929100831167ee30b85813b015e3 +size 201408771 diff --git a/global_step5494/layer_18-model_01-model_states.pt b/global_step5494/layer_18-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..766ffc8573fb875d8c929e84ce3fb14ca46086ae --- /dev/null +++ b/global_step5494/layer_18-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a5146ac31c7361b341e6546502bb6fd6ceeda55290834e8b101d4f5240688cc +size 201408771 diff --git a/global_step5494/layer_19-model_00-model_states.pt b/global_step5494/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b14fbb772d94fc412534e3c52967e85ea3362787 --- /dev/null +++ b/global_step5494/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4b7da5d99bab44567bd8c5fb2ef992c9e444c266cf7bbbebf77d7a229752332 +size 201408771 diff --git a/global_step5494/layer_19-model_01-model_states.pt b/global_step5494/layer_19-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d79d7748769f66ff7aa75c398556a51f32facfe --- /dev/null +++ b/global_step5494/layer_19-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a3eb02defa896d5aa744b8ca4799e47c29871608af6121f41bbf1ac1072c9d5 +size 201408771 diff --git a/global_step5494/layer_20-model_00-model_states.pt b/global_step5494/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..94adb85665c25e383579db33e8742cfc7b96ddee --- /dev/null +++ b/global_step5494/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33fb9713b880f4969a23fff0164075a0017b9f2485b9f1b13a374a6061f211ee +size 201408771 diff --git a/global_step5494/layer_20-model_01-model_states.pt b/global_step5494/layer_20-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e6c93e673ab21adcee1a05c0b1a85000bc83702 --- /dev/null +++ b/global_step5494/layer_20-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7557f5806bdd3716e24eed780d0ba6a57c4abd717da775940d35d2f21de1a696 +size 201408771 diff --git a/global_step5494/layer_21-model_00-model_states.pt b/global_step5494/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab1be77284ffd7164cb919a46edbafcb54106a21 --- /dev/null +++ b/global_step5494/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41a46289702832eb94b0633bc3740c3911316bf1a32646a2cfe44df8abc504cd +size 201408771 diff --git a/global_step5494/layer_21-model_01-model_states.pt b/global_step5494/layer_21-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1cb314b3679d2002dda7e517f70338610807be5c --- /dev/null +++ b/global_step5494/layer_21-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a00b663806279de5a0c377046dc6e22cad2c943c5727b584a220712850d6c7f +size 201408771 diff --git a/global_step5494/layer_22-model_00-model_states.pt b/global_step5494/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..55a0cc3a8dd5acce960a38610dc4a1757d2f4166 --- /dev/null +++ b/global_step5494/layer_22-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bbc56a70bd16bb15a1445243d8a1b25c99e7cb5f2200fe9d2a927dec38236e3 +size 201408771 diff --git a/global_step5494/layer_22-model_01-model_states.pt b/global_step5494/layer_22-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e34646ac343ed9af54f41ec662124f1c9e9a7b52 --- /dev/null +++ b/global_step5494/layer_22-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24dc5b57c7da8acc58aa8e2965549045b49ebc0bc72efe3f4cf6df853b798934 +size 201408771 diff --git a/global_step5494/layer_23-model_00-model_states.pt b/global_step5494/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee442739bf5dcd89cac178692f0f096feab149c4 --- /dev/null +++ b/global_step5494/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c57650ffe402b8b94e17b10bb51e94719f0b55f2dd70076d50bb488b602718d0 +size 201408771 diff --git a/global_step5494/layer_23-model_01-model_states.pt b/global_step5494/layer_23-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4104c3096c1425a5c5b97fb5265c10ab101c9d4d --- /dev/null +++ b/global_step5494/layer_23-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c265e2dfb5a0661c130fbfdbb18467504dbecbf7e41c3f3d1d8192444e219cae +size 201408771 diff --git a/global_step5494/layer_24-model_00-model_states.pt b/global_step5494/layer_24-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5e615e65453aa96b3a17f769f493ef2ddc82d12 --- /dev/null +++ b/global_step5494/layer_24-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afe611c87ac16a126e0dc9b6ed0e325fdc78d136f1283c0c29845352f93bcde8 +size 201408771 diff --git a/global_step5494/layer_24-model_01-model_states.pt b/global_step5494/layer_24-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5b4d44b60b5c6a6371b0b4ecd434fc57da42ec7 --- /dev/null +++ b/global_step5494/layer_24-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1c534d17a62e5bd563d2344ac92ed3c9544d5ee381480a4302f946b2a5c89b1 +size 201408771 diff --git a/global_step5494/layer_25-model_00-model_states.pt b/global_step5494/layer_25-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8605c22dce2d8a376a6f4ba2b8e8e17490b6f8a0 --- /dev/null +++ b/global_step5494/layer_25-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58016dec168da520c9fd872a60a0b5d03d4c5866fc4dc869a6b6d52970be2e20 +size 201408771 diff --git a/global_step5494/layer_25-model_01-model_states.pt b/global_step5494/layer_25-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..848454047f23d8c15a5f1555be408927a517c581 --- /dev/null +++ b/global_step5494/layer_25-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68afc3fefa3b7f7c5c606567849f14fef2ed56065564964ec9c7f3061ce0796b +size 201408771 diff --git a/global_step5494/layer_26-model_00-model_states.pt b/global_step5494/layer_26-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..56bcb869c1cf4125f3cdc1f7bd191670f9dfa88b --- /dev/null +++ b/global_step5494/layer_26-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b425331d0fcd0ec76c18cf9cbdc5e9ea538b7ef56904e4e9fed5591c790a33e2 +size 201408771 diff --git a/global_step5494/layer_26-model_01-model_states.pt b/global_step5494/layer_26-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e544aa06eb50038bc53230f92f3537b7303e7861 --- /dev/null +++ b/global_step5494/layer_26-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bbb8250c91fb8204ed8044ad3a9990efbac044d653759c7b43565e8606a0630 +size 201408771 diff --git a/global_step5494/layer_27-model_00-model_states.pt b/global_step5494/layer_27-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5fe2bde082e54d2412bc96f3e8fe1f39fbb2fc0 --- /dev/null +++ b/global_step5494/layer_27-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dd52a20f4025253f90e38712d91872618e0269f37580531884dc3a7185ee375 +size 201408771 diff --git a/global_step5494/layer_27-model_01-model_states.pt b/global_step5494/layer_27-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6649ba1b4a24c9d097d3957687a5dc5552fca11a --- /dev/null +++ b/global_step5494/layer_27-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e48ac9808f40308e4b27cf1d6ffe38e72b7a4e3482ca3128119a33205c1c8435 +size 201408771 diff --git a/global_step5494/layer_28-model_00-model_states.pt b/global_step5494/layer_28-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..87de4ed272719ffdfee2b5b3157f61a684d3e1f9 --- /dev/null +++ b/global_step5494/layer_28-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92974278592fb8c3c9bafed8e5f825411e6c31fe97739440cea294a189e9e6f3 +size 201408771 diff --git a/global_step5494/layer_28-model_01-model_states.pt b/global_step5494/layer_28-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc3367384746f2686f6c868b8bcaec717df604b8 --- /dev/null +++ b/global_step5494/layer_28-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f250691d3238461bd360fc70bc1dceab078ae0b7c8c35d823010919be33667e4 +size 201408771 diff --git a/global_step5494/layer_29-model_00-model_states.pt b/global_step5494/layer_29-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c77a384f5b38d5a46e4d157ae193fee678a73974 --- /dev/null +++ b/global_step5494/layer_29-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e451b64b36552fecf18683e7cabb384e250a2fcf988e9c5592f0bfb27abc9162 +size 201408771 diff --git a/global_step5494/layer_29-model_01-model_states.pt b/global_step5494/layer_29-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4da758c531f6bf020e9747759a0f4c208488e93 --- /dev/null +++ b/global_step5494/layer_29-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f5362b70b3a4b5f1b7e9e53c3c6d76b9f04c34dc75b5110711bb54cb3bbaca6 +size 201408771 diff --git a/global_step5494/layer_30-model_00-model_states.pt b/global_step5494/layer_30-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c48992fb3d397c48a257cea401fec0de6d306ed --- /dev/null +++ b/global_step5494/layer_30-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3e923e5b7c972d19230bc6c8ba15ec671de2bfed0930ba9e40718d22ccce2b0 +size 201408771 diff --git a/global_step5494/layer_30-model_01-model_states.pt b/global_step5494/layer_30-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..70b5412690e2910bb455091684a2903836478a83 --- /dev/null +++ b/global_step5494/layer_30-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ea9a803c4100a451fd51b87418129f25d5e7bd54c58a03710ca0f51115d12f2 +size 201408771 diff --git a/global_step5494/layer_31-model_00-model_states.pt b/global_step5494/layer_31-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b72a6ea1d2edf5194caa10e4bbb77ae78baeb36 --- /dev/null +++ b/global_step5494/layer_31-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4990bae0e6841fbc2930a802cd3dc67321e3508087566e503b5da9dfbf8bdcd7 +size 201408771 diff --git a/global_step5494/layer_31-model_01-model_states.pt b/global_step5494/layer_31-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5df5b68b3be7e657abec449a83b099c06abaed7c --- /dev/null +++ b/global_step5494/layer_31-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6257317c0d4274f8feaeb21831dbcb0bf0a48777dd53edbf5f49df5e34fd38c +size 201408771 diff --git a/global_step5494/layer_32-model_00-model_states.pt b/global_step5494/layer_32-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..edd2597734fbeda0e57748e5fad5eeb4ea47b3f2 --- /dev/null +++ b/global_step5494/layer_32-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f739f54ced8985eea6c043fb76d1f406aa4c58470376d7a7c3f1627337a404f6 +size 201408771 diff --git a/global_step5494/layer_32-model_01-model_states.pt b/global_step5494/layer_32-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d673cb489f297d7b63ffda30c9a2af0b7a94a750 --- /dev/null +++ b/global_step5494/layer_32-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f37bdbf9035767eded31ee9e9d253cd4273cbee9a340fdebe33a4faa0bb19e16 +size 201408771 diff --git a/global_step5494/layer_33-model_00-model_states.pt b/global_step5494/layer_33-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a81dbdb7007158d1ff6d72a5e1678f5b21ed363 --- /dev/null +++ b/global_step5494/layer_33-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9589b3a8a6977b9e144cf7389f2ccdcd14e899d9fb93bcd44715ff36c09a901 +size 201408771 diff --git a/global_step5494/layer_33-model_01-model_states.pt b/global_step5494/layer_33-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c84b0d409a04c626d5d0d337ca21417a3973e4c --- /dev/null +++ b/global_step5494/layer_33-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3b515a14f97fe1f5577cec9933822b4d1b55eaa243d15a9d50bb7ef6c765b22 +size 201408771 diff --git a/global_step5494/layer_34-model_00-model_states.pt b/global_step5494/layer_34-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..703baa8c8011edd7f5e791ff6a710db254373f8b --- /dev/null +++ b/global_step5494/layer_34-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a97c575b99de2e8ca0978846c1d4a6a61bf356667cbc84a32c7f7a3189acd402 +size 201408771 diff --git a/global_step5494/layer_34-model_01-model_states.pt b/global_step5494/layer_34-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d57d4dfc5392f5b59060bc17d39ecfb76d4e6ad5 --- /dev/null +++ b/global_step5494/layer_34-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8994581d9728a7af10e4fb701d5eb7635fa6ad64b8716eb13ed6c7a3e9163a4 +size 201408771 diff --git a/global_step5494/layer_35-model_00-model_states.pt b/global_step5494/layer_35-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6fb8fe58071e3813c586fbaaf65bdd0891a223b --- /dev/null +++ b/global_step5494/layer_35-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c7231718b973ed4b55e5455c25727f59234f55f068fa08b538b5c40a07c80c5 +size 201408771 diff --git a/global_step5494/layer_35-model_01-model_states.pt b/global_step5494/layer_35-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a84bb76469829f109a66c25a6c87e98d7651d10 --- /dev/null +++ b/global_step5494/layer_35-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d403bd96e6c6b8e0bb0d3aa370e4a75326a5710908eefe477b4eec38dc5d63b +size 201408771 diff --git a/global_step5494/layer_36-model_00-model_states.pt b/global_step5494/layer_36-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e687f7a56108a76be905b1eaa808ec44baf32e9 --- /dev/null +++ b/global_step5494/layer_36-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:298b6c6d2d8336320e3d47b2c3831218e8f583afa94a050039135dafc0e9041f +size 201408771 diff --git a/global_step5494/layer_36-model_01-model_states.pt b/global_step5494/layer_36-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1947d8b5ec62170db6569f44078ec1d409486f90 --- /dev/null +++ b/global_step5494/layer_36-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:623b08e49bc38d5bcef7c93c8d9406dee833717cc4935a7a88d6e00076b3e143 +size 201408771 diff --git a/global_step5494/layer_37-model_00-model_states.pt b/global_step5494/layer_37-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aedd60673ee1de2c3cddc8e4670fe0ea56d13b9f --- /dev/null +++ b/global_step5494/layer_37-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:739275d92e0bea7f17bcf12d1692e15cbad3df6436e8b427f76d110e3a954e09 +size 201408771 diff --git a/global_step5494/layer_37-model_01-model_states.pt b/global_step5494/layer_37-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..47b8c775861ceff3e23b0edd2096b9f5f9508f16 --- /dev/null +++ b/global_step5494/layer_37-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad10320f78a20213660d295dfa8c391b0deb98581caab558455b89887a53140d +size 201408771 diff --git a/global_step5494/layer_38-model_00-model_states.pt b/global_step5494/layer_38-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8272ffcff79b71faf3570a1e05fc56a49b6aef8f --- /dev/null +++ b/global_step5494/layer_38-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bad2ce8dce5b6d98d8eed145c08d587921da285fec5620367e272bd0a7cca37c +size 201408771 diff --git a/global_step5494/layer_38-model_01-model_states.pt b/global_step5494/layer_38-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..811e16afa61bafe60b4366f393398831fa03880b --- /dev/null +++ b/global_step5494/layer_38-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae13f408c5e89ff67367de21a50b817f7b752efa2dbab154d535eda9bb5e6265 +size 201408771 diff --git a/global_step5494/layer_39-model_00-model_states.pt b/global_step5494/layer_39-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c58d312969ca3bf9440b2589fd626e788587780 --- /dev/null +++ b/global_step5494/layer_39-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:057327fd5e65073d140700c2fe4aeb419e9c4b98badbe304d61448d6884dd106 +size 201408771 diff --git a/global_step5494/layer_39-model_01-model_states.pt b/global_step5494/layer_39-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4d15ddb08a5de52ab434863afa0aef9f2508b08 --- /dev/null +++ b/global_step5494/layer_39-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba2dc3a963d02e95c74732425cb0554906df762c835351813a58150256e66791 +size 201408771 diff --git a/global_step5494/layer_40-model_00-model_states.pt b/global_step5494/layer_40-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0af447c7c17691152b6da5628fc086193053e6cb --- /dev/null +++ b/global_step5494/layer_40-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c2b9d6c3f69b35b67186be62947d78b476d0c42c9cbcc83d1b439e791b97818 +size 201408771 diff --git a/global_step5494/layer_40-model_01-model_states.pt b/global_step5494/layer_40-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f6a62ed6725ff759b76f4ed0ba545f2271fb9f3 --- /dev/null +++ b/global_step5494/layer_40-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84c99d0b3b746e2225f65f702ac4df85963d1ebc7c0fb5a1927f1778145387cc +size 201408771 diff --git a/global_step5494/layer_41-model_00-model_states.pt b/global_step5494/layer_41-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba435dfdd5160e422f40c77f39ac84c47bf3f909 --- /dev/null +++ b/global_step5494/layer_41-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10418c36694ea6c32851aad81fff19130b729fe568987f543f6521b387f94f09 +size 201408771 diff --git a/global_step5494/layer_41-model_01-model_states.pt b/global_step5494/layer_41-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..838ecdca38e463094cb86f655ee590c9c2e47616 --- /dev/null +++ b/global_step5494/layer_41-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:294a5fcc2e2442cad7869363c07925815833bd95f84df6f582ba710b5a8e9c37 +size 201408771 diff --git a/global_step5494/layer_42-model_00-model_states.pt b/global_step5494/layer_42-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75f8ebbb716a37897e924c4886fce0ec41e50ced --- /dev/null +++ b/global_step5494/layer_42-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bdf258ad73aca85f536b5b6a3be14184799923b09f665b62c41f39e43472fc1 +size 201408771 diff --git a/global_step5494/layer_42-model_01-model_states.pt b/global_step5494/layer_42-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5cc26cb9fa1650fb9a1615f27396608524ee8fb8 --- /dev/null +++ b/global_step5494/layer_42-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e68c5bc42bea63d84b573b2bc786f01dc9ddb67094cf56c74b965f80fa10d99 +size 201408771 diff --git a/global_step5494/layer_43-model_00-model_states.pt b/global_step5494/layer_43-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fa010c7840d9242f9f3b794cc11ce3cb7c8742c --- /dev/null +++ b/global_step5494/layer_43-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6235035a89004a23c3f7ef2c141284a3b4e3d2fb8d679f21019e462de72b5013 +size 201408771 diff --git a/global_step5494/layer_43-model_01-model_states.pt b/global_step5494/layer_43-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d20d7709db428d80dece643f71bc2e4dbf224a8 --- /dev/null +++ b/global_step5494/layer_43-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e7be7606f6aad92234bd1bb252508fd1de4f538220514905a10f48b17d21922 +size 201408771 diff --git a/global_step5494/layer_44-model_00-model_states.pt b/global_step5494/layer_44-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..49565fcf34fb314c9f83a80e6847fdf69d823afd --- /dev/null +++ b/global_step5494/layer_44-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a41b1bdabd02f2dc6b7fa4b0ccf52c6720efd8d35b6f01050dbede8b27bc9e3 +size 201408771 diff --git a/global_step5494/layer_44-model_01-model_states.pt b/global_step5494/layer_44-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63a94d6ccc9895f5a56f1ed756568fa68f9da6d1 --- /dev/null +++ b/global_step5494/layer_44-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf387d10e5224c978928898fd8038c6526fbce7552dcbd5970128463e0e1fc8b +size 201408771 diff --git a/global_step5494/layer_46-model_00-model_states.pt b/global_step5494/layer_46-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4f5dfdc88569bd1cd52764ef72efd5d85bc0372 --- /dev/null +++ b/global_step5494/layer_46-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:826c6c562fb2634639d44b6c4f91000905f3dc36e933feea370e4443101d8589 +size 17603 diff --git a/global_step5494/layer_46-model_01-model_states.pt b/global_step5494/layer_46-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b6ee931d8ca32e069672ec20ba5e2ef84e47653 --- /dev/null +++ b/global_step5494/layer_46-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dac13eadc99c88808e28295640a33d17032abf01430b36e827c595384b0aa04 +size 17603 diff --git a/global_step5494/mp_rank_00_model_states.pt b/global_step5494/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..120a989e30b3ff7946133749d306798110c4f180 --- /dev/null +++ b/global_step5494/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4c4be575c43f6050c35ab958c5f7dc9054102841d677cb8e2a9a8c2588f33fe +size 39603 diff --git a/global_step5494/mp_rank_01_model_states.pt b/global_step5494/mp_rank_01_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a4bb2735ce8fcfdc8963edb4832d9a7a8dcff07 --- /dev/null +++ b/global_step5494/mp_rank_01_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73e5f7ae86fbf82b9cc032fe28758a1620163fd61bb41a11b11f2b791a5801a5 +size 39603 diff --git a/global_step5494/mp_rank_02_model_states.pt b/global_step5494/mp_rank_02_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..665cf9bd1684461a0bb21e3ee1cf520531081f6a --- /dev/null +++ b/global_step5494/mp_rank_02_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d363d7290d0df024a3d0ba974a349b9d4ab5b55679bbb8ae05abb9a337e4b52 +size 39731 diff --git a/global_step5494/mp_rank_03_model_states.pt b/global_step5494/mp_rank_03_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbb06373dfbaedc85e01ed312893a01065d2eaf8 --- /dev/null +++ b/global_step5494/mp_rank_03_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f60565619b02390a5a89fe0fc589826e1850cab489b3bc04ad16eeb665c4f674 +size 39731 diff --git a/launch.sh b/launch.sh new file mode 100755 index 0000000000000000000000000000000000000000..6c3445845a5dae614feb4b8a1206322cd7bd9050 --- /dev/null +++ b/launch.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Launch script using torch.distributed.run(). Used by slurm +# scripts, don't invoke directly. + +# Samuel's fix for apparent error in SLURM initialization +if [ $SLURM_LOCALID -eq 0 ]; then + rm -rf /dev/shm/* + rocm-smi || true +else + sleep 2 +fi + +export NCCL_SOCKET_IFNAME=hsn0,hsn1,hsn2,hsn3 +export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK +export FI_CXI_DEFAULT_CQ_SIZE=131072 + +# debugging (noisy) +#export NCCL_DEBUG=INFO +#export RCCL_KERNEL_COLL_TRACE_ENABLE=1 +#export NCCL_DEBUG_SUBSYS=INIT,COLL + +module --quiet purge +module load cray-python + +module load CrayEnv +module load PrgEnv-cray/8.3.3 +module load craype-accel-amd-gfx90a +module load cray-python + +module use /pfs/lustrep2/projappl/project_462000125/samantao-public/mymodules +module load suse-repo-deps/sam-default +module load rocm/sam-5.2.3.lua +module load rccl/sam-develop.lua +module load aws-ofi-rccl/sam-default.lua + +source venv/bin/activate + +MASTER_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +MASTER_PORT=9999 + +echo "Launching on $SLURMD_NODENAME ($SLURM_PROCID/$SLURM_JOB_NUM_NODES)," \ + "master $MASTER_NODE port $MASTER_PORT," \ + "GPUs $SLURM_GPUS_ON_NODE," \ + "CUDA: $(python -c 'import torch; print(torch.cuda.is_available())')" + +python -u -m torch.distributed.run \ + --nnodes $SLURM_JOB_NUM_NODES \ + --nproc_per_node $SLURM_GPUS_ON_NODE \ + --node_rank=$SLURM_PROCID \ + --master_addr $MASTER_NODE \ + --master_port $MASTER_PORT \ + "$@" diff --git a/logs/2098877.err b/logs/2098877.err new file mode 100644 index 0000000000000000000000000000000000000000..4ce5dc4b322c2185ebe90a8d08b3263e9aea29db --- /dev/null +++ b/logs/2098877.err @@ -0,0 +1,3625 @@ + 0: Successfully preprocessed all matching files. + 0: Detected CUDA files, patching ldflags + 0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... + 0: Building extension module scaled_upper_triang_masked_softmax_cuda... + 0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) + 0: Loading extension module scaled_upper_triang_masked_softmax_cuda... + 0: Successfully preprocessed all matching files. + 0: Detected CUDA files, patching ldflags + 0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... + 0: Building extension module scaled_masked_softmax_cuda... + 0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) + 0: Loading extension module scaled_masked_softmax_cuda... + 0: Successfully preprocessed all matching files. + 0: Detected CUDA files, patching ldflags + 0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... + 0: Building extension module fused_mix_prec_layer_norm_cuda... + 0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) + 0: Loading extension module fused_mix_prec_layer_norm_cuda... + 0: Successfully preprocessed all matching files. + 0: Successfully preprocessed all matching files. + 0: Successfully preprocessed all matching files. +63: Successfully preprocessed all matching files. +63: Successfully preprocessed all matching files. +63: Successfully preprocessed all matching files. +49: Successfully preprocessed all matching files. +49: Successfully preprocessed all matching files. +49: Successfully preprocessed all matching files. +33: Successfully preprocessed all matching files. +19: Successfully preprocessed all matching files. +47: Successfully preprocessed all matching files. +15: Successfully preprocessed all matching files. + 0: Successfully preprocessed all matching files. +37: Successfully preprocessed all matching files. +23: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +23: warnings.warn( +23: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +23: warnings.warn( +23: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +23: warnings.warn( +23: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +23: warnings.warn( +23: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +23: warnings.warn( +56: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +56: warnings.warn( +59: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +59: warnings.warn( +59: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +59: warnings.warn( +56: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +56: warnings.warn( +56: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +56: warnings.warn( +56: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +56: warnings.warn( +31: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +31: warnings.warn( +31: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +31: warnings.warn( +31: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +31: warnings.warn( +59: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +59: warnings.warn( +56: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +56: warnings.warn( +31: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +31: warnings.warn( +36: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +36: warnings.warn( +36: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +36: warnings.warn( +36: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +36: warnings.warn( +31: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +31: warnings.warn( +59: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +59: warnings.warn( +59: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +59: warnings.warn( +56: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +56: warnings.warn( + 4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 4: warnings.warn( +17: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +17: warnings.warn( +17: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +17: warnings.warn( +17: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +17: warnings.warn( + 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 1: warnings.warn( + 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 1: warnings.warn( + 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 1: warnings.warn( +11: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +11: warnings.warn( +11: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +11: warnings.warn( +11: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +11: warnings.warn( +13: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +13: warnings.warn( +13: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +13: warnings.warn( +36: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +36: warnings.warn( +37: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +37: warnings.warn( +31: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +31: warnings.warn( + 2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 2: warnings.warn( + 2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 2: warnings.warn( + 2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 2: warnings.warn( +30: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +30: warnings.warn( + 4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 4: warnings.warn( + 4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 4: warnings.warn( + 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 1: warnings.warn( +11: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +11: warnings.warn( +13: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +13: warnings.warn( +13: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +13: warnings.warn( +36: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +36: warnings.warn( +37: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +37: warnings.warn( +37: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +37: warnings.warn( +59: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +59: warnings.warn( + 2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 2: warnings.warn( + 2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 2: warnings.warn( +48: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +48: warnings.warn( +48: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +48: warnings.warn( +48: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +48: warnings.warn( +30: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +30: warnings.warn( + 4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 4: warnings.warn( +17: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +17: warnings.warn( +17: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +17: warnings.warn( + 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 1: warnings.warn( +43: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +43: warnings.warn( +37: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +37: warnings.warn( +47: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +47: warnings.warn( +47: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +47: warnings.warn( + 2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 2: warnings.warn( +48: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +48: warnings.warn( +48: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +48: warnings.warn( +30: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +30: warnings.warn( + 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 1: warnings.warn( +11: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +11: warnings.warn( +43: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +43: warnings.warn( +43: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +43: warnings.warn( +43: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +43: warnings.warn( +37: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +37: warnings.warn( +52: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +52: warnings.warn( +52: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +52: warnings.warn( +52: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +52: warnings.warn( +47: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +47: warnings.warn( +47: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +47: warnings.warn( +47: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +47: warnings.warn( +48: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +48: warnings.warn( +30: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +30: warnings.warn( + 4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 4: warnings.warn( +17: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +17: warnings.warn( +43: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +43: warnings.warn( +13: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +13: warnings.warn( +32: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +32: warnings.warn( +32: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +32: warnings.warn( +37: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +37: warnings.warn( +52: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +52: warnings.warn( +52: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +52: warnings.warn( +45: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +45: warnings.warn( +45: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +45: warnings.warn( +45: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +45: warnings.warn( +47: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +47: warnings.warn( +32: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +32: warnings.warn( +32: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +32: warnings.warn( +52: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +52: warnings.warn( +45: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +45: warnings.warn( +45: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +45: warnings.warn( +30: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +30: warnings.warn( +13: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +13: warnings.warn( +32: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +32: warnings.warn( +45: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +45: warnings.warn( +30: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +30: warnings.warn( +32: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +32: warnings.warn( +18: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +18: warnings.warn( +18: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +18: warnings.warn( +18: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +18: warnings.warn( +18: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +18: warnings.warn( +18: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +18: warnings.warn( +18: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +18: warnings.warn( +16: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +16: warnings.warn( +16: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +16: warnings.warn( +16: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +16: warnings.warn( +16: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +16: warnings.warn( +16: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +16: warnings.warn( +16: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +16: warnings.warn( +35: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +35: warnings.warn( +35: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +35: warnings.warn( +35: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +35: warnings.warn( +35: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +35: warnings.warn( +35: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +35: warnings.warn( +35: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +35: warnings.warn( +55: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +55: warnings.warn( +55: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +55: warnings.warn( +55: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +55: warnings.warn( +55: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +55: warnings.warn( +55: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +55: warnings.warn( +55: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +55: warnings.warn( +49: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +49: warnings.warn( +49: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +49: warnings.warn( +49: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +49: warnings.warn( +49: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +49: warnings.warn( +49: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +49: warnings.warn( +49: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +49: warnings.warn( +53: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +53: warnings.warn( +53: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +53: warnings.warn( +53: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +53: warnings.warn( +53: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +53: warnings.warn( +53: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +53: warnings.warn( +53: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +53: warnings.warn( +27: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +27: warnings.warn( +27: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +27: warnings.warn( +27: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +27: warnings.warn( +27: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +27: warnings.warn( +27: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +27: warnings.warn( +14: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +14: warnings.warn( +14: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +14: warnings.warn( +14: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +14: warnings.warn( +27: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +27: warnings.warn( +14: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +14: warnings.warn( +20: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +20: warnings.warn( +20: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +20: warnings.warn( +20: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +20: warnings.warn( +14: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +14: warnings.warn( +20: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +20: warnings.warn( +20: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +20: warnings.warn( +20: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +20: warnings.warn( +12: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +12: warnings.warn( +12: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +12: warnings.warn( +12: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +12: warnings.warn( +12: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +12: warnings.warn( +12: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +12: warnings.warn( +12: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +12: warnings.warn( +61: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +61: warnings.warn( +61: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +61: warnings.warn( +61: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +61: warnings.warn( +61: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +61: warnings.warn( +25: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +25: warnings.warn( +25: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +25: warnings.warn( +61: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +61: warnings.warn( +25: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +25: warnings.warn( +25: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +25: warnings.warn( +25: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +25: warnings.warn( +39: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +39: warnings.warn( +39: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +39: warnings.warn( +39: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +39: warnings.warn( +39: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +39: warnings.warn( +39: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +39: warnings.warn( +39: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +39: warnings.warn( + 3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 3: warnings.warn( + 3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 3: warnings.warn( + 3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 3: warnings.warn( +33: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +33: warnings.warn( +33: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +33: warnings.warn( + 3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 3: warnings.warn( + 3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 3: warnings.warn( +25: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +25: warnings.warn( +33: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +33: warnings.warn( +33: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +33: warnings.warn( +33: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +33: warnings.warn( +33: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +33: warnings.warn( + 3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 3: warnings.warn( +22: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +22: warnings.warn( +22: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +22: warnings.warn( +22: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +22: warnings.warn( +22: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +22: warnings.warn( +22: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +22: warnings.warn( +22: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +22: warnings.warn( +28: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +28: warnings.warn( +28: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +28: warnings.warn( +28: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +28: warnings.warn( +28: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +28: warnings.warn( +28: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +28: warnings.warn( +28: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +28: warnings.warn( +21: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +21: warnings.warn( +21: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +21: warnings.warn( +21: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +21: warnings.warn( +21: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +21: warnings.warn( +21: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +21: warnings.warn( + 7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 7: warnings.warn( + 7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 7: warnings.warn( + 7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 7: warnings.warn( +21: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +21: warnings.warn( + 7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 7: warnings.warn( + 7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 7: warnings.warn( + 7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 7: warnings.warn( + 8: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 8: warnings.warn( + 8: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 8: warnings.warn( + 8: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 8: warnings.warn( + 8: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 8: warnings.warn( + 8: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 8: warnings.warn( +19: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +19: warnings.warn( +19: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +19: warnings.warn( +61: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +61: warnings.warn( + 8: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 8: warnings.warn( +19: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +19: warnings.warn( +19: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +19: warnings.warn( +29: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +29: warnings.warn( +29: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +29: warnings.warn( +29: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +29: warnings.warn( +19: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +19: warnings.warn( +54: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +54: warnings.warn( +54: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +54: warnings.warn( +54: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +54: warnings.warn( +29: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +29: warnings.warn( +29: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +29: warnings.warn( +54: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +54: warnings.warn( +54: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +54: warnings.warn( +29: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +29: warnings.warn( +51: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +51: warnings.warn( +51: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +51: warnings.warn( +51: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +51: warnings.warn( +40: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +40: warnings.warn( +40: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +40: warnings.warn( +40: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +40: warnings.warn( +51: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +51: warnings.warn( +19: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +19: warnings.warn( +40: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +40: warnings.warn( +51: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +51: warnings.warn( +40: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +40: warnings.warn( +40: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +40: warnings.warn( +54: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +54: warnings.warn( + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 0: warnings.warn( + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 0: warnings.warn( +34: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +34: warnings.warn( + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 0: warnings.warn( + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 0: warnings.warn( +51: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +51: warnings.warn( + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 0: warnings.warn( +34: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +34: warnings.warn( +34: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +34: warnings.warn( +34: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +34: warnings.warn( +34: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +34: warnings.warn( + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 0: warnings.warn( +38: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +38: warnings.warn( +38: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +38: warnings.warn( +38: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +38: warnings.warn( +38: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +38: warnings.warn( +38: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +38: warnings.warn( +34: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +34: warnings.warn( +38: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +38: warnings.warn( +44: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +44: warnings.warn( +44: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +44: warnings.warn( +44: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +44: warnings.warn( +50: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +50: warnings.warn( + 9: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 9: warnings.warn( + 9: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 9: warnings.warn( + 9: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 9: warnings.warn( +44: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +44: warnings.warn( +44: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +44: warnings.warn( +50: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +50: warnings.warn( + 9: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 9: warnings.warn( + 9: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 9: warnings.warn( +50: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +50: warnings.warn( +10: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +10: warnings.warn( +44: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +44: warnings.warn( +50: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +50: warnings.warn( +50: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +50: warnings.warn( +10: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +10: warnings.warn( +14: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +14: warnings.warn( +10: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +10: warnings.warn( + 9: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 9: warnings.warn( +57: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +57: warnings.warn( +57: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +57: warnings.warn( +57: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +57: warnings.warn( +24: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +24: warnings.warn( +50: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +50: warnings.warn( + 6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 6: warnings.warn( + 6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 6: warnings.warn( + 6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 6: warnings.warn( +57: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +57: warnings.warn( +57: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +57: warnings.warn( +24: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +24: warnings.warn( +24: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +24: warnings.warn( +10: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +10: warnings.warn( +10: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +10: warnings.warn( + 6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 6: warnings.warn( + 6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 6: warnings.warn( +24: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +24: warnings.warn( + 6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 6: warnings.warn( +24: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +24: warnings.warn( +10: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +10: warnings.warn( +15: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +15: warnings.warn( +57: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +57: warnings.warn( +24: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +24: warnings.warn( +42: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +42: warnings.warn( +42: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +42: warnings.warn( +42: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +42: warnings.warn( +42: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +42: warnings.warn( +42: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +42: warnings.warn( +63: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +63: warnings.warn( +26: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +26: warnings.warn( +26: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +26: warnings.warn( +26: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +26: warnings.warn( +63: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +63: warnings.warn( +63: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +63: warnings.warn( +26: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +26: warnings.warn( +15: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +15: warnings.warn( +15: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +15: warnings.warn( +15: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +15: warnings.warn( +63: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +63: warnings.warn( +26: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +26: warnings.warn( +15: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +15: warnings.warn( +63: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +63: warnings.warn( +42: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +42: warnings.warn( +46: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +46: warnings.warn( +46: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +46: warnings.warn( +46: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +46: warnings.warn( +15: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +15: warnings.warn( +63: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +63: warnings.warn( +46: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +46: warnings.warn( +46: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +46: warnings.warn( +26: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +26: warnings.warn( +41: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +41: warnings.warn( +41: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +41: warnings.warn( +41: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +41: warnings.warn( +41: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +41: warnings.warn( +41: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +41: warnings.warn( +58: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +58: warnings.warn( +58: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +58: warnings.warn( +58: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +58: warnings.warn( +58: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +58: warnings.warn( +58: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +58: warnings.warn( + 5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 5: warnings.warn( + 5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 5: warnings.warn( + 5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 5: warnings.warn( +46: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +46: warnings.warn( + 5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 5: warnings.warn( +41: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +41: warnings.warn( +60: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +60: warnings.warn( +60: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +60: warnings.warn( +58: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +58: warnings.warn( + 5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 5: warnings.warn( +62: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +62: warnings.warn( + 5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 5: warnings.warn( +62: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +62: warnings.warn( +62: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +62: warnings.warn( +62: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +62: warnings.warn( +62: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +62: warnings.warn( +60: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +60: warnings.warn( +60: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +60: warnings.warn( +60: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +60: warnings.warn( +62: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +62: warnings.warn( +60: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +60: warnings.warn( +36: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +36: warnings.warn( +23: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +23: warnings.warn( + 4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 4: warnings.warn( +11: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +11: warnings.warn( +43: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +43: warnings.warn( + 7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 7: warnings.warn( +39: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +39: warnings.warn( +13: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +13: warnings.warn( +45: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +45: warnings.warn( +27: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +27: warnings.warn( +59: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +59: warnings.warn( + 5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 5: warnings.warn( +37: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +37: warnings.warn( +10: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +10: warnings.warn( +42: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +42: warnings.warn( + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 0: warnings.warn( + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 0: warnings.warn( +25: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +25: warnings.warn( +27: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +27: warnings.warn( +59: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +59: warnings.warn( +22: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +22: warnings.warn( +32: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +32: warnings.warn( +32: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +32: warnings.warn( +57: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +57: warnings.warn( +20: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +20: warnings.warn( +25: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +25: warnings.warn( +52: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +52: warnings.warn( +57: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +57: warnings.warn( +16: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +16: warnings.warn( +48: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +48: warnings.warn( + 8: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 8: warnings.warn( + 8: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 8: warnings.warn( +20: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +20: warnings.warn( +40: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +40: warnings.warn( +40: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +40: warnings.warn( +52: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +52: warnings.warn( + 3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 3: warnings.warn( +35: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +35: warnings.warn( +54: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +54: warnings.warn( + 4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 4: warnings.warn( +36: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +36: warnings.warn( +18: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +18: warnings.warn( + 4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 4: warnings.warn( +50: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +50: warnings.warn( +36: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +36: warnings.warn( +61: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +61: warnings.warn( +29: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +29: warnings.warn( +15: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +15: warnings.warn( +47: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +47: warnings.warn( +10: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +10: warnings.warn( +42: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +42: warnings.warn( +26: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +26: warnings.warn( + 7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 7: warnings.warn( +58: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +58: warnings.warn( +39: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +39: warnings.warn( +16: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +16: warnings.warn( +48: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +48: warnings.warn( + 2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 2: warnings.warn( +13: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +13: warnings.warn( +45: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +45: warnings.warn( +34: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +34: warnings.warn( +56: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +56: warnings.warn( +30: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +30: warnings.warn( +24: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +24: warnings.warn( +56: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +56: warnings.warn( +18: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +18: warnings.warn( +24: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +24: warnings.warn( + 3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 3: warnings.warn( +62: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +62: warnings.warn( +50: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +50: warnings.warn( +35: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +35: warnings.warn( + 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 1: warnings.warn( +26: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +26: warnings.warn( +33: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +33: warnings.warn( +58: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +58: warnings.warn( + 2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 2: warnings.warn( + 9: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 9: warnings.warn( +28: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +28: warnings.warn( +22: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +22: warnings.warn( +41: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +41: warnings.warn( +54: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +54: warnings.warn( +34: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +34: warnings.warn( + 9: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 9: warnings.warn( +41: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +41: warnings.warn( +60: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +60: warnings.warn( +17: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +17: warnings.warn( +30: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +30: warnings.warn( +62: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +62: warnings.warn( +23: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +23: warnings.warn( +23: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +23: warnings.warn( +55: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +55: warnings.warn( +55: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +55: warnings.warn( +19: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +19: warnings.warn( +12: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +12: warnings.warn( +28: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +28: warnings.warn( +49: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +49: warnings.warn( +51: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +51: warnings.warn( +60: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +60: warnings.warn( +44: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +44: warnings.warn( +14: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +14: warnings.warn( +46: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +46: warnings.warn( + 6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 6: warnings.warn( +38: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +38: warnings.warn( + 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 1: warnings.warn( +14: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +14: warnings.warn( +15: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +15: warnings.warn( +12: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +12: warnings.warn( +46: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +46: warnings.warn( +44: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +44: warnings.warn( +17: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +17: warnings.warn( +49: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +49: warnings.warn( + 6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 6: warnings.warn( +38: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +38: warnings.warn( +47: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +47: warnings.warn( +33: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +33: warnings.warn( + 5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 5: warnings.warn( +31: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +31: warnings.warn( +37: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +37: warnings.warn( +31: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +31: warnings.warn( +63: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +63: warnings.warn( +21: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +21: warnings.warn( +63: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +63: warnings.warn( +19: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +19: warnings.warn( +53: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +53: warnings.warn( +51: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +51: warnings.warn( +21: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +21: warnings.warn( +53: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +53: warnings.warn( +11: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +11: warnings.warn( +43: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +43: warnings.warn( +11: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +11: warnings.warn( +43: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +43: warnings.warn( +61: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +61: warnings.warn( +29: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +29: warnings.warn( +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: + 0: + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: + 1: + 1: + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: + 2: + 2: + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: + 3: + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: + 4: + 4: + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: + 5: + 5: + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: + 7: + 7: + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: + 6: + 6: +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: + 8: + 8: + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: + 9: + 9: +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: +10: +10: +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: +11: +11: +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: +12: +12: +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: +13: +13: +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: +14: +14: +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: +16: +16: +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: +17: +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: +18: +18: +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: +20: +20: +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: +21: +21: +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: +22: +22: +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: +23: +23: +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: +24: +24: +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: +25: +25: + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +35: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +34: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +59: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +32: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +32: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +32: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +32: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +33: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +33: +33: +33: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +34: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +34: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +34: +35: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +35: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +35: +37: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +37: +37: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +37: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +36: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +36: +36: +36: +47: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +47: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +38: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +38: +38: +38: +47: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +47: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +39: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +39: +39: +39: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +40: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +40: +40: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +40: +49: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +49: +49: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +49: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +41: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +41: +41: +41: +42: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +42: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +42: +42: +43: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +43: +43: +43: +44: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +44: +44: +44: +45: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +45: +45: +46: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +46: +46: +46: +45: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +48: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +48: +48: +48: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +50: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +50: +50: +50: +51: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +51: +51: +51: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +52: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +52: +52: +59: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +59: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +59: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +53: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +53: +53: +53: +63: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: +52: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +54: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +54: +54: +54: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +57: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +57: +57: +57: +56: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +56: +56: +56: + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +58: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +58: +58: +58: +60: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +60: +60: +60: +61: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +61: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +61: +61: + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +62: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +62: +62: +62: + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: +28: +28: +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Building extension module utils... +26: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +37: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +36: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +38: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +39: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +40: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +41: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +45: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +43: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +42: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +44: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +47: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +49: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +48: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +50: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +51: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +53: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +57: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +60: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +32: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +32: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +32: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +32: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +33: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +33: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +33: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +35: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +34: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +33: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +35: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +34: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +35: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +34: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +35: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +37: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +37: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +36: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +36: +36: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +37: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +38: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +38: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +38: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Loading extension module utils... +17: Loading extension module utils... +26: Loading extension module utils... +18: Loading extension module utils... +17: Loading extension module utils... +39: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +39: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +39: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Loading extension module utils... +17: Loading extension module utils... +34: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +40: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +40: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +40: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +41: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +41: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +41: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +42: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +42: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +42: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +43: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +43: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +43: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +44: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +44: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +44: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +45: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +45: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +45: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +47: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +47: +47: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +46: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +46: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +46: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +46: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +49: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +49: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +49: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +48: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +48: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +48: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +50: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +50: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +50: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +52: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +52: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +51: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +52: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +51: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +51: +52: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +53: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +53: +53: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: +55: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +54: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +54: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +54: +54: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +56: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +56: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +56: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +56: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +59: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +57: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +57: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +57: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +59: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +59: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +58: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +59: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +58: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +58: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +58: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +60: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +60: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +60: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +61: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +61: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +61: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +62: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +61: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +62: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +62: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +62: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +36: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +36: Building extension module utils... +36: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +36: Loading extension module utils... +35: Loading extension module utils... +34: Loading extension module utils... + 0: Loading extension module utils... + 0: Loading extension module utils... + 0: Loading extension module utils... + 0: Loading extension module utils... +55: Loading extension module utils... +59: Loading extension module utils... + 2: Loading extension module utils... + 2: Loading extension module utils... + 2: Loading extension module utils... + 2: Loading extension module utils... + 1: Loading extension module utils... + 4: Loading extension module utils... + 1: Loading extension module utils... + 3: Loading extension module utils... + 5: Loading extension module utils... + 4: Loading extension module utils... + 1: Loading extension module utils... + 4: Loading extension module utils... + 5: Loading extension module utils... + 1: Loading extension module utils... + 4: Loading extension module utils... + 5: Loading extension module utils... + 5: Loading extension module utils... + 3: Loading extension module utils... + 3: Loading extension module utils... + 7: Loading extension module utils... + 6: Loading extension module utils... + 3: Loading extension module utils... + 7: Loading extension module utils... + 6: Loading extension module utils... + 7: Loading extension module utils... +15: Loading extension module utils... + 6: Loading extension module utils... + 7: Loading extension module utils... + 6: Loading extension module utils... + 8: Loading extension module utils... +15: Loading extension module utils... + 8: Loading extension module utils... +15: Loading extension module utils... + 8: Loading extension module utils... +15: Loading extension module utils... +10: Loading extension module utils... + 8: Loading extension module utils... + 9: Loading extension module utils... +32: Loading extension module utils... +10: Loading extension module utils... + 9: Loading extension module utils... +32: Loading extension module utils... +10: Loading extension module utils... +32: Loading extension module utils... + 9: Loading extension module utils... +10: Loading extension module utils... +33: Loading extension module utils... + 9: Loading extension module utils... +32: Loading extension module utils... +11: Loading extension module utils... +19: Loading extension module utils... +11: Loading extension module utils... +33: Loading extension module utils... +35: Loading extension module utils... +33: Loading extension module utils... +11: Loading extension module utils... +12: Loading extension module utils... +19: Loading extension module utils... +11: Loading extension module utils... +13: Loading extension module utils... +19: Loading extension module utils... +14: Loading extension module utils... +12: Loading extension module utils... +19: Loading extension module utils... +13: Loading extension module utils... +12: Loading extension module utils... +14: Loading extension module utils... +13: Loading extension module utils... +12: Loading extension module utils... +13: Loading extension module utils... +14: Loading extension module utils... +14: Loading extension module utils... +16: Loading extension module utils... +16: Loading extension module utils... +16: Loading extension module utils... +16: Loading extension module utils... +36: Loading extension module utils... +17: Loading extension module utils... +18: Loading extension module utils... +33: Loading extension module utils... +18: Loading extension module utils... +36: Loading extension module utils... +20: Loading extension module utils... +38: Loading extension module utils... +20: Loading extension module utils... +35: Loading extension module utils... +38: Loading extension module utils... +21: Loading extension module utils... +20: Loading extension module utils... +37: Loading extension module utils... +35: Loading extension module utils... +38: Loading extension module utils... +37: Loading extension module utils... +34: Loading extension module utils... +20: Loading extension module utils... +35: Loading extension module utils... +37: Loading extension module utils... +39: Loading extension module utils... +22: Loading extension module utils... +34: Loading extension module utils... +34: Loading extension module utils... +21: Loading extension module utils... +24: Loading extension module utils... +21: Loading extension module utils... + 0: Loading extension module utils... +34: Loading extension module utils... +41: Loading extension module utils... +21: Loading extension module utils... + 0: Loading extension module utils... +24: Loading extension module utils... +39: Loading extension module utils... +41: Loading extension module utils... +24: Loading extension module utils... +41: Loading extension module utils... +40: Loading extension module utils... +39: Loading extension module utils... +23: Loading extension module utils... +24: Loading extension module utils... +25: Loading extension module utils... +40: Loading extension module utils... +22: Loading extension module utils... +42: Loading extension module utils... +40: Loading extension module utils... +22: Loading extension module utils... + 1: Loading extension module utils... +42: Loading extension module utils... +43: Loading extension module utils... +25: Loading extension module utils... +44: Loading extension module utils... +22: Loading extension module utils... +43: Loading extension module utils... +23: Loading extension module utils... +42: Loading extension module utils... +25: Loading extension module utils... +45: Loading extension module utils... +23: Loading extension module utils... +44: Loading extension module utils... +25: Loading extension module utils... + 1: Loading extension module utils... +45: Loading extension module utils... +43: Loading extension module utils... +23: Loading extension module utils... +44: Loading extension module utils... + 1: Loading extension module utils... +45: Loading extension module utils... +47: Loading extension module utils... +47: Loading extension module utils... + 2: Loading extension module utils... +47: Loading extension module utils... + 3: Loading extension module utils... + 3: Loading extension module utils... + 2: Loading extension module utils... + 3: Loading extension module utils... + 4: Loading extension module utils... + 2: Loading extension module utils... + 3: Loading extension module utils... + 2: Loading extension module utils... + 4: Loading extension module utils... +49: Loading extension module utils... +49: Loading extension module utils... + 5: Loading extension module utils... + 4: Loading extension module utils... +46: Loading extension module utils... + 5: Loading extension module utils... + 6: Loading extension module utils... + 7: Loading extension module utils... +46: Loading extension module utils... + 5: Loading extension module utils... + 6: Loading extension module utils... + 7: Loading extension module utils... +49: Loading extension module utils... + 6: Loading extension module utils... + 7: Loading extension module utils... +46: Loading extension module utils... +46: Loading extension module utils... +48: Loading extension module utils... +52: Loading extension module utils... +48: Loading extension module utils... +50: Loading extension module utils... +48: Loading extension module utils... +50: Loading extension module utils... +52: Loading extension module utils... +52: Loading extension module utils... +50: Loading extension module utils... + 8: Loading extension module utils... +52: Loading extension module utils... + 9: Loading extension module utils... +51: Loading extension module utils... + 8: Loading extension module utils... + 8: Loading extension module utils... +55: Loading extension module utils... +51: Loading extension module utils... + 9: Loading extension module utils... +10: Loading extension module utils... + 8: Loading extension module utils... + 9: Loading extension module utils... +51: Loading extension module utils... +53: Loading extension module utils... +53: Loading extension module utils... +55: Loading extension module utils... +10: Loading extension module utils... +54: Loading extension module utils... +55: Loading extension module utils... +10: Loading extension module utils... +10: Loading extension module utils... +53: Loading extension module utils... +54: Loading extension module utils... +55: Loading extension module utils... +54: Loading extension module utils... +11: Loading extension module utils... +13: Loading extension module utils... +11: Loading extension module utils... +54: Loading extension module utils... +12: Loading extension module utils... +11: Loading extension module utils... +13: Loading extension module utils... +12: Loading extension module utils... +11: Loading extension module utils... +12: Loading extension module utils... +13: Loading extension module utils... +59: Loading extension module utils... +56: Loading extension module utils... +14: Loading extension module utils... +59: Loading extension module utils... +15: Loading extension module utils... +14: Loading extension module utils... +56: Loading extension module utils... +14: Loading extension module utils... +59: Loading extension module utils... +57: Loading extension module utils... +56: Loading extension module utils... +59: Loading extension module utils... +56: Loading extension module utils... +15: Loading extension module utils... +57: Loading extension module utils... +26: Loading extension module utils... +15: Loading extension module utils... +57: Loading extension module utils... +15: Loading extension module utils... +26: Loading extension module utils... + 0: Loading extension module utils... +60: Loading extension module utils... +58: Loading extension module utils... +60: Loading extension module utils... +17: Loading extension module utils... +16: Loading extension module utils... +17: Loading extension module utils... +18: Loading extension module utils... +16: Loading extension module utils... +60: Loading extension module utils... +58: Loading extension module utils... +27: Loading extension module utils... +17: Loading extension module utils... +16: Loading extension module utils... +18: Loading extension module utils... +19: Loading extension module utils... +58: Loading extension module utils... +18: Loading extension module utils... +27: Loading extension module utils... +58: Loading extension module utils... +62: Loading extension module utils... +27: Loading extension module utils... +19: Loading extension module utils... +63: Loading extension module utils... +62: Loading extension module utils... +62: Loading extension module utils... +61: Loading extension module utils... +19: Loading extension module utils... +63: Loading extension module utils... +62: Loading extension module utils... +61: Loading extension module utils... +29: Loading extension module utils... +19: Loading extension module utils... +63: Loading extension module utils... +61: Loading extension module utils... +20: Loading extension module utils... +61: Loading extension module utils... +20: Loading extension module utils... +29: Loading extension module utils... +20: Loading extension module utils... +29: Loading extension module utils... +20: Loading extension module utils... +29: Loading extension module utils... +21: Loading extension module utils... +30: Loading extension module utils... +23: Loading extension module utils... +31: Loading extension module utils... +23: Loading extension module utils... +22: Loading extension module utils... +23: Loading extension module utils... +30: Loading extension module utils... +22: Loading extension module utils... +31: Loading extension module utils... +23: Loading extension module utils... +26: Loading extension module utils... +30: Loading extension module utils... +21: Loading extension module utils... +22: Loading extension module utils... +31: Loading extension module utils... +21: Loading extension module utils... +30: Loading extension module utils... +21: Loading extension module utils... +31: Loading extension module utils... +24: Loading extension module utils... +26: Loading extension module utils... +27: Loading extension module utils... +26: Loading extension module utils... +27: Loading extension module utils... +24: Loading extension module utils... +26: Loading extension module utils... +24: Loading extension module utils... +27: Loading extension module utils... +24: Loading extension module utils... +25: Loading extension module utils... +27: Loading extension module utils... +25: Loading extension module utils... +25: Loading extension module utils... +30: Loading extension module utils... +29: Loading extension module utils... +30: Loading extension module utils... +30: Loading extension module utils... +29: Loading extension module utils... +31: Loading extension module utils... +29: Loading extension module utils... +31: Loading extension module utils... +31: Loading extension module utils... +28: Loading extension module utils... +28: Loading extension module utils... +28: Loading extension module utils... +28: Loading extension module utils... +28: Loading extension module utils... +28: Loading extension module utils... +28: Loading extension module utils... +28: Loading extension module utils... +32: Loading extension module utils... +32: Loading extension module utils... +32: Loading extension module utils... +34: Loading extension module utils... +35: Loading extension module utils... +34: Loading extension module utils... +35: Loading extension module utils... +34: Loading extension module utils... +35: Loading extension module utils... +33: Loading extension module utils... +33: Loading extension module utils... +33: Loading extension module utils... +33: Loading extension module utils... +37: Loading extension module utils... +37: Loading extension module utils... +37: Loading extension module utils... +37: Loading extension module utils... +36: Loading extension module utils... +36: Loading extension module utils... +36: Loading extension module utils... +36: Loading extension module utils... +47: Loading extension module utils... +47: Loading extension module utils... +38: Loading extension module utils... +47: Loading extension module utils... +38: Loading extension module utils... +47: Loading extension module utils... +38: Loading extension module utils... +38: Loading extension module utils... +40: Loading extension module utils... +40: Loading extension module utils... +40: Loading extension module utils... +39: Loading extension module utils... +40: Loading extension module utils... +39: Loading extension module utils... +39: Loading extension module utils... +49: Loading extension module utils... +39: Loading extension module utils... +49: Loading extension module utils... +49: Loading extension module utils... +49: Loading extension module utils... +55: Loading extension module utils... +41: Loading extension module utils... +42: Loading extension module utils... +41: Loading extension module utils... +55: Loading extension module utils... +42: Loading extension module utils... +44: Loading extension module utils... +41: Loading extension module utils... +42: Loading extension module utils... +43: Loading extension module utils... +41: Loading extension module utils... +42: Loading extension module utils... +55: Loading extension module utils... +43: Loading extension module utils... +44: Loading extension module utils... +43: Loading extension module utils... +44: Loading extension module utils... +43: Loading extension module utils... +44: Loading extension module utils... +45: Loading extension module utils... +45: Loading extension module utils... +45: Loading extension module utils... +46: Loading extension module utils... +46: Loading extension module utils... +46: Loading extension module utils... +46: Loading extension module utils... +45: Loading extension module utils... +37: Loading extension module utils... +59: Loading extension module utils... +48: Loading extension module utils... +48: Loading extension module utils... +36: Loading extension module utils... +48: Loading extension module utils... +59: Loading extension module utils... +48: Loading extension module utils... +50: Loading extension module utils... +59: Loading extension module utils... +38: Loading extension module utils... +50: Loading extension module utils... +51: Loading extension module utils... +50: Loading extension module utils... +51: Loading extension module utils... +50: Loading extension module utils... +51: Loading extension module utils... +52: Loading extension module utils... +51: Loading extension module utils... +52: Loading extension module utils... + 0: Loading extension module utils... +39: Loading extension module utils... +40: Loading extension module utils... +52: Loading extension module utils... +52: Loading extension module utils... +53: Loading extension module utils... +41: Loading extension module utils... +53: Loading extension module utils... +53: Loading extension module utils... +53: Loading extension module utils... +45: Loading extension module utils... + 1: Loading extension module utils... +54: Loading extension module utils... +54: Loading extension module utils... +43: Loading extension module utils... +42: Loading extension module utils... +54: Loading extension module utils... +63: Loading extension module utils... +54: Loading extension module utils... +63: Loading extension module utils... +63: Loading extension module utils... +44: Loading extension module utils... +63: Loading extension module utils... +57: Loading extension module utils... +57: Loading extension module utils... + 4: Loading extension module utils... +57: Loading extension module utils... +47: Loading extension module utils... +57: Loading extension module utils... +56: Loading extension module utils... + 5: Loading extension module utils... +56: Loading extension module utils... +56: Loading extension module utils... +58: Loading extension module utils... +56: Loading extension module utils... +58: Loading extension module utils... +49: Loading extension module utils... +58: Loading extension module utils... +60: Loading extension module utils... +48: Loading extension module utils... +58: Loading extension module utils... + 7: Loading extension module utils... + 6: Loading extension module utils... +60: Loading extension module utils... +61: Loading extension module utils... +60: Loading extension module utils... +61: Loading extension module utils... +60: Loading extension module utils... +61: Loading extension module utils... +62: Loading extension module utils... +61: Loading extension module utils... +62: Loading extension module utils... +62: Loading extension module utils... +50: Loading extension module utils... +62: Loading extension module utils... + 9: Loading extension module utils... +51: Loading extension module utils... +12: Loading extension module utils... +53: Loading extension module utils... +14: Loading extension module utils... +13: Loading extension module utils... +57: Loading extension module utils... +16: Loading extension module utils... +17: Loading extension module utils... +18: Loading extension module utils... +60: Loading extension module utils... +63: Loading extension module utils... +22: Loading extension module utils... +25: Loading extension module utils... +32: Loading extension module utils... +29: Loading extension module utils... +30: Loading extension module utils... +31: Loading extension module utils... +27: Loading extension module utils... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: + 0: + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: No modifications detected for re-loaded extension module utils, skipping build step... + 0: Loading extension module utils... + 0: No modifications detected for re-loaded extension module utils, skipping build step... + 0: Loading extension module utils... + 0: No modifications detected for re-loaded extension module utils, skipping build step... + 0: Loading extension module utils... + 0: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 0: + 0: Loading extension module utils... + 0: Loading extension module utils... + 0: No modifications detected for re-loaded extension module utils, skipping build step... + 0: Loading extension module utils... + 0: No modifications detected for re-loaded extension module utils, skipping build step... + 0: Loading extension module utils... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: No modifications detected for re-loaded extension module utils, skipping build step... +19: Loading extension module utils... +19: No modifications detected for re-loaded extension module utils, skipping build step... +19: Loading extension module utils... +19: No modifications detected for re-loaded extension module utils, skipping build step... +19: Loading extension module utils... +19: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +19: No modifications detected for re-loaded extension module utils, skipping build step... +19: +19: Loading extension module utils...Loading extension module utils... +19: Loading extension module utils... +19: +19: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +19: +19: Loading extension module utils...Loading extension module utils... +19: +50: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +50: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +50: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +50: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +50: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +50: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +50: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +50: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +50: No modifications detected for re-loaded extension module utils, skipping build step... +50: Loading extension module utils... +50: No modifications detected for re-loaded extension module utils, skipping build step... +50: Loading extension module utils... +50: No modifications detected for re-loaded extension module utils, skipping build step... +50: Loading extension module utils... +50: No modifications detected for re-loaded extension module utils, skipping build step... +50: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +50: +50: Loading extension module utils... +50: No modifications detected for re-loaded extension module utils, skipping build step... +50: Loading extension module utils... +50: No modifications detected for re-loaded extension module utils, skipping build step... +50: Loading extension module utils... +50: No modifications detected for re-loaded extension module utils, skipping build step... +50: Loading extension module utils... +44: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +44: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +44: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +44: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +44: +44: +44: +44: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +44: No modifications detected for re-loaded extension module utils, skipping build step... +44: Loading extension module utils... +44: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +44: +44: Loading extension module utils...Loading extension module utils... +44: +44: No modifications detected for re-loaded extension module utils, skipping build step... +44: Loading extension module utils... +44: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +44: +44: Loading extension module utils... +44: Loading extension module utils... +44: No modifications detected for re-loaded extension module utils, skipping build step... +44: Loading extension module utils... +44: No modifications detected for re-loaded extension module utils, skipping build step... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: +44: Loading extension module utils... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: +32: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: No modifications detected for re-loaded extension module utils, skipping build step... +15: Loading extension module utils... +32: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +32: +32: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +32: +32: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +32: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: No modifications detected for re-loaded extension module utils, skipping build step... +15: Loading extension module utils... + 3: No modifications detected for re-loaded extension module utils, skipping build step... + 3: Loading extension module utils... +32: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +32: No modifications detected for re-loaded extension module utils, skipping build step... +32: Loading extension module utils... +32: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +32: +32: +32: +32: +32: +32: Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... +32: +32: +32: +32: Loading extension module utils... +32: +32: No modifications detected for re-loaded extension module utils, skipping build step... +15: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +15: +15: Loading extension module utils...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +15: +15: +32: Loading extension module utils... +15: Loading extension module utils... +15: No modifications detected for re-loaded extension module utils, skipping build step... +15: Loading extension module utils... +15: No modifications detected for re-loaded extension module utils, skipping build step... +15: Loading extension module utils... + 3: No modifications detected for re-loaded extension module utils, skipping build step... + 3: Loading extension module utils... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: No modifications detected for re-loaded extension module utils, skipping build step... +15: Loading extension module utils... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: No modifications detected for re-loaded extension module utils, skipping build step... + 3: Loading extension module utils... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: No modifications detected for re-loaded extension module utils, skipping build step... + 3: Loading extension module utils... + 3: No modifications detected for re-loaded extension module utils, skipping build step... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: Loading extension module utils... + 3: No modifications detected for re-loaded extension module utils, skipping build step... + 3: Loading extension module utils... +43: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +43: +43: + 3: No modifications detected for re-loaded extension module utils, skipping build step... + 3: Loading extension module utils... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +43: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +43: +43: +14: No modifications detected for re-loaded extension module utils, skipping build step... +14: Loading extension module utils... + 3: No modifications detected for re-loaded extension module utils, skipping build step... + 3: Loading extension module utils... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: No modifications detected for re-loaded extension module utils, skipping build step... +14: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +14: +14: Loading extension module utils... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +43: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +43: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: No modifications detected for re-loaded extension module utils, skipping build step... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +14: +14: Loading extension module utils... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: No modifications detected for re-loaded extension module utils, skipping build step... +14: Loading extension module utils... +43: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +43: +43: Loading extension module utils...Loading extension module utils... +43: +43: No modifications detected for re-loaded extension module utils, skipping build step... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: No modifications detected for re-loaded extension module utils, skipping build step... +14: Loading extension module utils... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +43: Loading extension module utils... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: No modifications detected for re-loaded extension module utils, skipping build step... +14: Loading extension module utils... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: + 9: +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +43: No modifications detected for re-loaded extension module utils, skipping build step... +43: Loading extension module utils... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: No modifications detected for re-loaded extension module utils, skipping build step... + 9: Loading extension module utils... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +43: No modifications detected for re-loaded extension module utils, skipping build step... +43: Loading extension module utils... + 9: No modifications detected for re-loaded extension module utils, skipping build step... + 9: Loading extension module utils... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: + 9: No modifications detected for re-loaded extension module utils, skipping build step... + 9: Loading extension module utils... +43: No modifications detected for re-loaded extension module utils, skipping build step... +43: Loading extension module utils... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: +17: No modifications detected for re-loaded extension module utils, skipping build step... +17: Loading extension module utils... +17: No modifications detected for re-loaded extension module utils, skipping build step... +17: Loading extension module utils... +43: No modifications detected for re-loaded extension module utils, skipping build step... +43: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +43: +43: Loading extension module utils... + 9: No modifications detected for re-loaded extension module utils, skipping build step... + 9: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... + 9: + 9: Loading extension module utils... +17: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +17: No modifications detected for re-loaded extension module utils, skipping build step... +17: Loading extension module utils... +17: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +17: Loading extension module utils... + 9: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 9: + 9: Loading extension module utils...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... + 9: + 9: + 9: Loading extension module utils... +17: +17: +17: Loading extension module utils... +23: No modifications detected for re-loaded extension module utils, skipping build step... +23: Loading extension module utils... +17: No modifications detected for re-loaded extension module utils, skipping build step... +17: Loading extension module utils... + 6: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 6: + 6: Loading extension module utils...Loading extension module utils... + 6: + 6: No modifications detected for re-loaded extension module utils, skipping build step... +17: No modifications detected for re-loaded extension module utils, skipping build step... +17: Loading extension module utils... + 6: Loading extension module utils... + 6: No modifications detected for re-loaded extension module utils, skipping build step... + 6: Loading extension module utils... + 6: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 6: + 6: Loading extension module utils...Loading extension module utils... + 6: +23: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +23: +23: Loading extension module utils...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +23: + 6: No modifications detected for re-loaded extension module utils, skipping build step... + 6: Loading extension module utils... + 6: No modifications detected for re-loaded extension module utils, skipping build step... + 6: Loading extension module utils... +23: +23: Loading extension module utils... +23: No modifications detected for re-loaded extension module utils, skipping build step... +23: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +23: +23: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +23: +23: Loading extension module utils... +23: No modifications detected for re-loaded extension module utils, skipping build step... +48: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +48: +48: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +48: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +48: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +48: +48: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +33: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +48: +33: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +48: No modifications detected for re-loaded extension module utils, skipping build step... +48: Loading extension module utils... +48: No modifications detected for re-loaded extension module utils, skipping build step... +48: Loading extension module utils... +33: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +48: No modifications detected for re-loaded extension module utils, skipping build step... +48: Loading extension module utils... +49: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +33: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +33: +33: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +33: +33: +49: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +33: No modifications detected for re-loaded extension module utils, skipping build step... +33: Loading extension module utils... +48: No modifications detected for re-loaded extension module utils, skipping build step... +48: Loading extension module utils... +49: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +49: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +49: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +49: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +33: No modifications detected for re-loaded extension module utils, skipping build step... +33: Loading extension module utils... +48: No modifications detected for re-loaded extension module utils, skipping build step... +48: Loading extension module utils... +49: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +33: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +33: +33: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +48: No modifications detected for re-loaded extension module utils, skipping build step... +48: Loading extension module utils... +48: No modifications detected for re-loaded extension module utils, skipping build step... +48: Loading extension module utils... +49: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +33: +33: Loading extension module utils...Loading extension module utils... +33: +33: Loading extension module utils... +33: Loading extension module utils... +33: Loading extension module utils... +33: +49: No modifications detected for re-loaded extension module utils, skipping build step... +49: Loading extension module utils... +33: No modifications detected for re-loaded extension module utils, skipping build step... +33: Loading extension module utils... +48: No modifications detected for re-loaded extension module utils, skipping build step... +48: Loading extension module utils... +49: No modifications detected for re-loaded extension module utils, skipping build step... +49: Loading extension module utils... +49: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +49: +49: Loading extension module utils...Loading extension module utils... +49: +49: No modifications detected for re-loaded extension module utils, skipping build step... +49: Loading extension module utils... +49: No modifications detected for re-loaded extension module utils, skipping build step... +49: Loading extension module utils... +49: No modifications detected for re-loaded extension module utils, skipping build step... +49: Loading extension module utils... +49: No modifications detected for re-loaded extension module utils, skipping build step... +49: Loading extension module utils... +37: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +37: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +37: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +37: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +37: +37: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +37: +37: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +37: No modifications detected for re-loaded extension module utils, skipping build step... +37: Loading extension module utils... +37: No modifications detected for re-loaded extension module utils, skipping build step... +37: Loading extension module utils... +37: No modifications detected for re-loaded extension module utils, skipping build step... +37: Loading extension module utils... +37: No modifications detected for re-loaded extension module utils, skipping build step... +37: Loading extension module utils... +37: No modifications detected for re-loaded extension module utils, skipping build step... +37: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +37: +37: Loading extension module utils... +37: No modifications detected for re-loaded extension module utils, skipping build step... +37: Loading extension module utils... +37: No modifications detected for re-loaded extension module utils, skipping build step... +37: Loading extension module utils... +63: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: +63: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: No modifications detected for re-loaded extension module utils, skipping build step... +63: Loading extension module utils... +63: No modifications detected for re-loaded extension module utils, skipping build step... +63: Loading extension module utils... +63: No modifications detected for re-loaded extension module utils, skipping build step... +63: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +63: +63: Loading extension module utils... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +63: +63: Loading extension module utils... +63: Loading extension module utils... +41: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +41: + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: +45: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +45: +45: + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: +41: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +41: +45: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +35: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +35: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: No modifications detected for re-loaded extension module utils, skipping build step... +63: Loading extension module utils... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +45: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +45: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +45: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +45: + 8: No modifications detected for re-loaded extension module utils, skipping build step... + 8: Loading extension module utils... +35: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +63: No modifications detected for re-loaded extension module utils, skipping build step... +63: Loading extension module utils... +41: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +41: +35: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +35: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +41: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 8: +35: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +35: +41: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +45: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +45: +45: Loading extension module utils... +45: Loading extension module utils... +45: No modifications detected for re-loaded extension module utils, skipping build step... +45: Loading extension module utils... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... + 8: + 8: + 8: + 8: + 8: Loading extension module utils...Loading extension module utils...Loading extension module utils... + 8: + 8: +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +35: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +54: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +41: No modifications detected for re-loaded extension module utils, skipping build step... +41: Loading extension module utils... +41: No modifications detected for re-loaded extension module utils, skipping build step... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +41: Loading extension module utils... +41: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +41: + 8: No modifications detected for re-loaded extension module utils, skipping build step... + 8: Loading extension module utils... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +35: No modifications detected for re-loaded extension module utils, skipping build step... +35: Loading extension module utils... +35: No modifications detected for re-loaded extension module utils, skipping build step... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: +41: Loading extension module utils...Loading extension module utils... +41: +41: No modifications detected for re-loaded extension module utils, skipping build step... +41: Loading extension module utils... +41: No modifications detected for re-loaded extension module utils, skipping build step... +45: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +45: No modifications detected for re-loaded extension module utils, skipping build step... +45: Loading extension module utils... +45: Loading extension module utils... +45: +45: Loading extension module utils... +54: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +41: Loading extension module utils... + 8: No modifications detected for re-loaded extension module utils, skipping build step... + 8: Loading extension module utils... +45: No modifications detected for re-loaded extension module utils, skipping build step... +45: Loading extension module utils... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +45: No modifications detected for re-loaded extension module utils, skipping build step... +45: Loading extension module utils... +12: +35: Loading extension module utils... +35: No modifications detected for re-loaded extension module utils, skipping build step... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +54: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +54: +54: +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +35: Loading extension module utils... +35: No modifications detected for re-loaded extension module utils, skipping build step... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +54: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +35: Loading extension module utils... +35: No modifications detected for re-loaded extension module utils, skipping build step... +35: Loading extension module utils... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +54: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +41: No modifications detected for re-loaded extension module utils, skipping build step... +41: Loading extension module utils... +12: No modifications detected for re-loaded extension module utils, skipping build step... +12: Loading extension module utils... +30: No modifications detected for re-loaded extension module utils, skipping build step... +30: Loading extension module utils... +54: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: No modifications detected for re-loaded extension module utils, skipping build step... +12: Loading extension module utils... +35: No modifications detected for re-loaded extension module utils, skipping build step... +35: Loading extension module utils... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: No modifications detected for re-loaded extension module utils, skipping build step... +30: Loading extension module utils... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +41: No modifications detected for re-loaded extension module utils, skipping build step... +41: Loading extension module utils... +35: No modifications detected for re-loaded extension module utils, skipping build step... +35: Loading extension module utils... +30: No modifications detected for re-loaded extension module utils, skipping build step... +30: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +30: +30: Loading extension module utils... +12: No modifications detected for re-loaded extension module utils, skipping build step... +12: Loading extension module utils... +38: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +38: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +12: No modifications detected for re-loaded extension module utils, skipping build step... +38: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +38: + 5: No modifications detected for re-loaded extension module utils, skipping build step... + 5: Loading extension module utils... +12: Loading extension module utils...Loading extension module utils... +12: +12: +12: Loading extension module utils... +12: No modifications detected for re-loaded extension module utils, skipping build step... +12: Loading extension module utils... +35: No modifications detected for re-loaded extension module utils, skipping build step... +35: Loading extension module utils... +30: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +30: +30: Loading extension module utils...Loading extension module utils... +30: +54: No modifications detected for re-loaded extension module utils, skipping build step... +54: Loading extension module utils... +38: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +38: +38: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: No modifications detected for re-loaded extension module utils, skipping build step... + 5: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... + 5: + 5: +30: No modifications detected for re-loaded extension module utils, skipping build step... + 5: Loading extension module utils...Loading extension module utils... + 5: +12: No modifications detected for re-loaded extension module utils, skipping build step... +12: Loading extension module utils... +38: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: No modifications detected for re-loaded extension module utils, skipping build step... + 5: Loading extension module utils... +30: Loading extension module utils... + 5: No modifications detected for re-loaded extension module utils, skipping build step... + 5: Loading extension module utils... +38: No modifications detected for re-loaded extension module utils, skipping build step... +38: Loading extension module utils... +38: No modifications detected for re-loaded extension module utils, skipping build step... + 5: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 5: + 5: Loading extension module utils...Loading extension module utils... + 5: +30: No modifications detected for re-loaded extension module utils, skipping build step... +30: Loading extension module utils... +38: Loading extension module utils... +38: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +38: +38: Loading extension module utils... +38: Loading extension module utils... +38: No modifications detected for re-loaded extension module utils, skipping build step... +54: No modifications detected for re-loaded extension module utils, skipping build step... +54: Loading extension module utils... +38: Loading extension module utils... +54: No modifications detected for re-loaded extension module utils, skipping build step... +42: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +54: Loading extension module utils... +38: No modifications detected for re-loaded extension module utils, skipping build step... +38: Loading extension module utils... +38: No modifications detected for re-loaded extension module utils, skipping build step... +59: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +54: No modifications detected for re-loaded extension module utils, skipping build step... +59: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +59: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +54: Loading extension module utils... +38: Loading extension module utils... +42: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +42: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +59: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +59: +59: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +54: No modifications detected for re-loaded extension module utils, skipping build step... +38: No modifications detected for re-loaded extension module utils, skipping build step... +42: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +42: +42: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +59: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +54: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +54: +54: Loading extension module utils... +38: Loading extension module utils... +42: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +59: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +54: No modifications detected for re-loaded extension module utils, skipping build step... +54: Loading extension module utils... +42: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +59: No modifications detected for re-loaded extension module utils, skipping build step... +59: Loading extension module utils... +42: No modifications detected for re-loaded extension module utils, skipping build step... +42: Loading extension module utils... +54: No modifications detected for re-loaded extension module utils, skipping build step... +54: Loading extension module utils... +42: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +42: +42: Loading extension module utils...Loading extension module utils... +42: +59: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +59: No modifications detected for re-loaded extension module utils, skipping build step... +59: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +59: +59: Loading extension module utils... +59: Loading extension module utils... +59: +59: Loading extension module utils... +59: No modifications detected for re-loaded extension module utils, skipping build step... +59: Loading extension module utils... +59: No modifications detected for re-loaded extension module utils, skipping build step... +42: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +42: +42: Loading extension module utils...Loading extension module utils... +42: +55: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +59: Loading extension module utils... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +42: No modifications detected for re-loaded extension module utils, skipping build step... +42: Loading extension module utils... +51: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +42: No modifications detected for re-loaded extension module utils, skipping build step... +42: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +42: +55: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +42: Loading extension module utils... +59: No modifications detected for re-loaded extension module utils, skipping build step... +59: Loading extension module utils... +51: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...No modifications detected for re-loaded extension module utils, skipping build step... +55: +55: Loading extension module utils... +51: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: No modifications detected for re-loaded extension module utils, skipping build step... +55: Loading extension module utils... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +51: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +51: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +51: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +51: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...No modifications detected for re-loaded extension module utils, skipping build step... +55: +10: No modifications detected for re-loaded extension module utils, skipping build step... +10: Loading extension module utils... +10: No modifications detected for re-loaded extension module utils, skipping build step... +52: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +51: No modifications detected for re-loaded extension module utils, skipping build step... +51: Loading extension module utils... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: Loading extension module utils... +10: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +10: +10: Loading extension module utils... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: No modifications detected for re-loaded extension module utils, skipping build step... +10: Loading extension module utils... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +52: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +52: +52: +51: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: +34: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +10: +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +46: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +46: +46: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: No modifications detected for re-loaded extension module utils, skipping build step... +55: Loading extension module utils... +10: Loading extension module utils...Loading extension module utils... +10: +10: No modifications detected for re-loaded extension module utils, skipping build step... +10: Loading extension module utils... +52: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +51: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +51: +51: Loading extension module utils...Loading extension module utils... +51: + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: +34: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +34: +55: No modifications detected for re-loaded extension module utils, skipping build step... +10: No modifications detected for re-loaded extension module utils, skipping build step... +10: Loading extension module utils... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +52: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +51: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +51: +51: +51: + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +34: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +34: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +55: Loading extension module utils... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +51: Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... +51: +51: +51: + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: No modifications detected for re-loaded extension module utils, skipping build step... +27: Loading extension module utils... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +46: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +46: +55: No modifications detected for re-loaded extension module utils, skipping build step... +55: Loading extension module utils... +52: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +34: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +46: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: No modifications detected for re-loaded extension module utils, skipping build step... +18: Loading extension module utils... +52: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: + 1: + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: +11: +27: No modifications detected for re-loaded extension module utils, skipping build step... +34: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +46: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: No modifications detected for re-loaded extension module utils, skipping build step... +18: Loading extension module utils... +52: No modifications detected for re-loaded extension module utils, skipping build step... +52: Loading extension module utils... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +51: No modifications detected for re-loaded extension module utils, skipping build step... + 1: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 1: + 1: Loading extension module utils...Loading extension module utils... + 1: +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Loading extension module utils... +27: No modifications detected for re-loaded extension module utils, skipping build step... +27: Loading extension module utils... +25: No modifications detected for re-loaded extension module utils, skipping build step... +25: Loading extension module utils... +46: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: +55: No modifications detected for re-loaded extension module utils, skipping build step... +55: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +55: + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: +11: +27: No modifications detected for re-loaded extension module utils, skipping build step... +27: Loading extension module utils... +25: No modifications detected for re-loaded extension module utils, skipping build step... +25: Loading extension module utils... +36: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +36: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +36: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +36: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +34: No modifications detected for re-loaded extension module utils, skipping build step... +34: Loading extension module utils... +55: Loading extension module utils... +18: No modifications detected for re-loaded extension module utils, skipping build step... +18: Loading extension module utils... +52: No modifications detected for re-loaded extension module utils, skipping build step... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: +51: Loading extension module utils... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: No modifications detected for re-loaded extension module utils, skipping build step... +11: Loading extension module utils... +11: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +11: +11: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +11: Loading extension module utils... +11: +36: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +34: No modifications detected for re-loaded extension module utils, skipping build step... +34: Loading extension module utils... +46: No modifications detected for re-loaded extension module utils, skipping build step... +46: Loading extension module utils... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +52: Loading extension module utils... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: +16: No modifications detected for re-loaded extension module utils, skipping build step... +16: Loading extension module utils... + 4: No modifications detected for re-loaded extension module utils, skipping build step... + 4: Loading extension module utils... + 1: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 1: + 1: Loading extension module utils...Loading extension module utils... + 1: + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: No modifications detected for re-loaded extension module utils, skipping build step... +27: Loading extension module utils... +36: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: +18: No modifications detected for re-loaded extension module utils, skipping build step... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: + 2: + 2: + 7: No modifications detected for re-loaded extension module utils, skipping build step... + 7: Loading extension module utils... +11: Loading extension module utils... +25: No modifications detected for re-loaded extension module utils, skipping build step... +36: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +34: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +46: No modifications detected for re-loaded extension module utils, skipping build step... +46: Loading extension module utils... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +52: No modifications detected for re-loaded extension module utils, skipping build step... +52: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +52: +53: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +53: + 4: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 4: + 4: Loading extension module utils...Loading extension module utils... + 4: + 1: No modifications detected for re-loaded extension module utils, skipping build step... + 1: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... + 1: + 1: + 1: Loading extension module utils... + 7: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 7: + 7: + 7: Loading extension module utils...Loading extension module utils...Loading extension module utils... + 7: + 7: +25: Loading extension module utils... +36: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +34: No modifications detected for re-loaded extension module utils, skipping build step... +34: Loading extension module utils... +26: +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Loading extension module utils... +52: Loading extension module utils... +52: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +52: +52: Loading extension module utils... +52: Loading extension module utils... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +16: +16: +16: Loading extension module utils...Loading extension module utils...Loading extension module utils... +16: +16: + 4: No modifications detected for re-loaded extension module utils, skipping build step... + 4: Loading extension module utils... + 4: No modifications detected for re-loaded extension module utils, skipping build step... + 1: Loading extension module utils... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: No modifications detected for re-loaded extension module utils, skipping build step... + 2: Loading extension module utils... +53: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: No modifications detected for re-loaded extension module utils, skipping build step... +16: Loading extension module utils... + 4: Loading extension module utils... + 1: No modifications detected for re-loaded extension module utils, skipping build step... +27: No modifications detected for re-loaded extension module utils, skipping build step... +25: No modifications detected for re-loaded extension module utils, skipping build step... +36: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +36: +36: +36: +36: +36: +36: Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... +36: Loading extension module utils... +36: +36: +36: +36: +46: No modifications detected for re-loaded extension module utils, skipping build step... +20: No modifications detected for re-loaded extension module utils, skipping build step... +20: Loading extension module utils... +20: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +20: +18: No modifications detected for re-loaded extension module utils, skipping build step... +18: Loading extension module utils... +53: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +53: + 4: No modifications detected for re-loaded extension module utils, skipping build step... + 4: Loading extension module utils... + 1: Loading extension module utils... + 7: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 7: + 7: + 7: Loading extension module utils...Loading extension module utils...Loading extension module utils... + 7: + 7: +11: No modifications detected for re-loaded extension module utils, skipping build step... +11: Loading extension module utils... +27: Loading extension module utils... +25: Loading extension module utils... +25: No modifications detected for re-loaded extension module utils, skipping build step... +25: Loading extension module utils... +25: No modifications detected for re-loaded extension module utils, skipping build step... +25: Loading extension module utils... +36: No modifications detected for re-loaded extension module utils, skipping build step... +36: Loading extension module utils... +34: No modifications detected for re-loaded extension module utils, skipping build step... +46: Loading extension module utils... +26: No modifications detected for re-loaded extension module utils, skipping build step... +26: Loading extension module utils... +20: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...Loading extension module utils... +20: +20: +20: Loading extension module utils... +52: No modifications detected for re-loaded extension module utils, skipping build step... +52: Loading extension module utils... +53: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: No modifications detected for re-loaded extension module utils, skipping build step... + 7: Loading extension module utils... +11: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +11: +11: Loading extension module utils...Loading extension module utils... +11: +27: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +27: +27: Loading extension module utils...Loading extension module utils... +27: +36: No modifications detected for re-loaded extension module utils, skipping build step... +36: Loading extension module utils... +34: Loading extension module utils... +46: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +46: +46: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...Loading extension module utils... +26: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +26: +26: Loading extension module utils...Loading extension module utils... +26: +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: No modifications detected for re-loaded extension module utils, skipping build step... + 2: Loading extension module utils... +53: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: No modifications detected for re-loaded extension module utils, skipping build step... +16: Loading extension module utils... +34: No modifications detected for re-loaded extension module utils, skipping build step... +18: No modifications detected for re-loaded extension module utils, skipping build step... +18: Loading extension module utils... +18: No modifications detected for re-loaded extension module utils, skipping build step... +52: No modifications detected for re-loaded extension module utils, skipping build step... +52: Loading extension module utils... +16: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +16: +34: Loading extension module utils... +46: No modifications detected for re-loaded extension module utils, skipping build step... +46: +46: +46: Loading extension module utils... +46: +46: Loading extension module utils...Loading extension module utils... +46: +18: Loading extension module utils... + 2: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 2: + 2: Loading extension module utils...Loading extension module utils... + 2: +53: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Loading extension module utils...Loading extension module utils... +16: + 4: No modifications detected for re-loaded extension module utils, skipping build step... + 4: Loading extension module utils... + 4: No modifications detected for re-loaded extension module utils, skipping build step... + 4: Loading extension module utils... +11: No modifications detected for re-loaded extension module utils, skipping build step... +11: Loading extension module utils... +20: No modifications detected for re-loaded extension module utils, skipping build step... +20: Loading extension module utils... +18: No modifications detected for re-loaded extension module utils, skipping build step... +18: Loading extension module utils... +53: No modifications detected for re-loaded extension module utils, skipping build step... +53: Loading extension module utils... +25: No modifications detected for re-loaded extension module utils, skipping build step... +34: No modifications detected for re-loaded extension module utils, skipping build step... +26: No modifications detected for re-loaded extension module utils, skipping build step... +26: Loading extension module utils... + 2: No modifications detected for re-loaded extension module utils, skipping build step... + 2: Loading extension module utils... +53: No modifications detected for re-loaded extension module utils, skipping build step... +53: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +53: +25: Loading extension module utils... +34: Loading extension module utils... +34: No modifications detected for re-loaded extension module utils, skipping build step... +26: No modifications detected for re-loaded extension module utils, skipping build step... +26: Loading extension module utils... +25: No modifications detected for re-loaded extension module utils, skipping build step... +25: Loading extension module utils... +34: Loading extension module utils... +26: No modifications detected for re-loaded extension module utils, skipping build step... +26: Loading extension module utils... +20: No modifications detected for re-loaded extension module utils, skipping build step... + 2: No modifications detected for re-loaded extension module utils, skipping build step... + 2: Loading extension module utils... +53: Loading extension module utils... +26: No modifications detected for re-loaded extension module utils, skipping build step... +20: Loading extension module utils... + 2: No modifications detected for re-loaded extension module utils, skipping build step... + 2: Loading extension module utils... +53: No modifications detected for re-loaded extension module utils, skipping build step... +53: Loading extension module utils... +34: No modifications detected for re-loaded extension module utils, skipping build step... +34: Loading extension module utils... +26: Loading extension module utils... +20: No modifications detected for re-loaded extension module utils, skipping build step... +20: Loading extension module utils... +26: No modifications detected for re-loaded extension module utils, skipping build step... +26: Loading extension module utils... +53: No modifications detected for re-loaded extension module utils, skipping build step... +53: Loading extension module utils... + 2: No modifications detected for re-loaded extension module utils, skipping build step... +20: No modifications detected for re-loaded extension module utils, skipping build step... + 2: Loading extension module utils... +53: No modifications detected for re-loaded extension module utils, skipping build step... +53: Loading extension module utils... +20: Loading extension module utils... +53: No modifications detected for re-loaded extension module utils, skipping build step... +53: Loading extension module utils... +53: No modifications detected for re-loaded extension module utils, skipping build step... +53: Loading extension module utils... +23: Loading extension module utils... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: +31: No modifications detected for re-loaded extension module utils, skipping build step... +31: Loading extension module utils... +62: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: No modifications detected for re-loaded extension module utils, skipping build step... +31: Loading extension module utils... +62: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: No modifications detected for re-loaded extension module utils, skipping build step... +31: Loading extension module utils... +62: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +57: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +62: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +31: +31: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +31: +31: No modifications detected for re-loaded extension module utils, skipping build step... +31: +62: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Loading extension module utils... +31: +31: Loading extension module utils...Loading extension module utils... +31: +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +62: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +62: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +57: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +62: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: +21: +62: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +62: +62: Loading extension module utils...Loading extension module utils... +62: +57: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +57: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +57: +57: +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +57: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +62: No modifications detected for re-loaded extension module utils, skipping build step... +62: Loading extension module utils... +13: No modifications detected for re-loaded extension module utils, skipping build step... +13: Loading extension module utils... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: +62: No modifications detected for re-loaded extension module utils, skipping build step... +62: Loading extension module utils... +62: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +62: +57: No modifications detected for re-loaded extension module utils, skipping build step... +13: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +13: +13: +13: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...Loading extension module utils... +13: +13: +13: Loading extension module utils... +13: Loading extension module utils... +24: +57: Loading extension module utils... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: No modifications detected for re-loaded extension module utils, skipping build step... +21: Loading extension module utils... +62: Loading extension module utils...Loading extension module utils... +62: +57: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: No modifications detected for re-loaded extension module utils, skipping build step... +21: Loading extension module utils... +57: No modifications detected for re-loaded extension module utils, skipping build step... +21: No modifications detected for re-loaded extension module utils, skipping build step... +21: Loading extension module utils... +57: Loading extension module utils... +62: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +62: +62: Loading extension module utils... +62: Loading extension module utils... +13: No modifications detected for re-loaded extension module utils, skipping build step... +13: Loading extension module utils... +57: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +13: No modifications detected for re-loaded extension module utils, skipping build step... +47: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +57: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +57: +57: +57: +57: +57: Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... +57: +57: +57: +13: Loading extension module utils... +24: No modifications detected for re-loaded extension module utils, skipping build step... +24: Loading extension module utils... +24: No modifications detected for re-loaded extension module utils, skipping build step... +21: No modifications detected for re-loaded extension module utils, skipping build step... +21: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +21: +21: Loading extension module utils... +57: No modifications detected for re-loaded extension module utils, skipping build step... +21: No modifications detected for re-loaded extension module utils, skipping build step... +21: Loading extension module utils... +47: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +47: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +57: Loading extension module utils... +13: No modifications detected for re-loaded extension module utils, skipping build step... +13: Loading extension module utils... +24: Loading extension module utils... +47: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +47: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +47: +21: No modifications detected for re-loaded extension module utils, skipping build step... +47: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Loading extension module utils... +21: No modifications detected for re-loaded extension module utils, skipping build step... +47: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Loading extension module utils... +47: No modifications detected for re-loaded extension module utils, skipping build step... +47: Loading extension module utils... +47: No modifications detected for re-loaded extension module utils, skipping build step... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +39: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: No modifications detected for re-loaded extension module utils, skipping build step... +24: Loading extension module utils... +24: No modifications detected for re-loaded extension module utils, skipping build step... +47: Loading extension module utils... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +39: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +47: No modifications detected for re-loaded extension module utils, skipping build step... +47: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +47: No modifications detected for re-loaded extension module utils, skipping build step... +47: +47: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +47: Loading extension module utils... +47: +40: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +40: +40: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +40: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +40: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +39: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: Loading extension module utils... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: +39: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: No modifications detected for re-loaded extension module utils, skipping build step... +24: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +47: Loading extension module utils... +47: No modifications detected for re-loaded extension module utils, skipping build step... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: +29: +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: +24: +24: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +24: +24: Loading extension module utils... +47: Loading extension module utils... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +40: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +39: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +39: +58: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +58: +40: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +39: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +39: +22: No modifications detected for re-loaded extension module utils, skipping build step... +22: Loading extension module utils... +22: No modifications detected for re-loaded extension module utils, skipping build step... +29: No modifications detected for re-loaded extension module utils, skipping build step... +29: Loading extension module utils... +40: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Loading extension module utils... +24: No modifications detected for re-loaded extension module utils, skipping build step... +24: Loading extension module utils... +58: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: No modifications detected for re-loaded extension module utils, skipping build step... +29: Loading extension module utils... +39: No modifications detected for re-loaded extension module utils, skipping build step... +39: Loading extension module utils... +22: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +22: +22: Loading extension module utils...Loading extension module utils... +22: +58: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +47: No modifications detected for re-loaded extension module utils, skipping build step... +47: Loading extension module utils... +39: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +39: +58: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +58: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +29: +29: Loading extension module utils...Loading extension module utils... +29: +40: No modifications detected for re-loaded extension module utils, skipping build step... +40: Loading extension module utils... +39: Loading extension module utils...Loading extension module utils... +39: +58: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +58: +29: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +29: +29: +29: +29: Loading extension module utils...Loading extension module utils...Loading extension module utils... +29: Loading extension module utils... +29: +29: +40: No modifications detected for re-loaded extension module utils, skipping build step... +40: Loading extension module utils... +39: No modifications detected for re-loaded extension module utils, skipping build step... +39: Loading extension module utils... +22: No modifications detected for re-loaded extension module utils, skipping build step... +22: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +22: +22: Loading extension module utils... +58: No modifications detected for re-loaded extension module utils, skipping build step... +58: Loading extension module utils... +22: No modifications detected for re-loaded extension module utils, skipping build step... +22: Loading extension module utils... +40: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +40: +40: +40: Loading extension module utils...Loading extension module utils...Loading extension module utils... +40: +40: +22: No modifications detected for re-loaded extension module utils, skipping build step... +22: Loading extension module utils... +58: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +58: +40: No modifications detected for re-loaded extension module utils, skipping build step... +40: Loading extension module utils... +60: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +39: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +39: +39: Loading extension module utils...Loading extension module utils... +39: +58: Loading extension module utils...Loading extension module utils... +58: +40: No modifications detected for re-loaded extension module utils, skipping build step... +40: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +40: +40: Loading extension module utils... +39: No modifications detected for re-loaded extension module utils, skipping build step... +39: Loading extension module utils... +60: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +39: No modifications detected for re-loaded extension module utils, skipping build step... +39: Loading extension module utils... +60: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +60: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +58: No modifications detected for re-loaded extension module utils, skipping build step... +58: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +58: +58: Loading extension module utils... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: +58: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +58: +58: +58: Loading extension module utils...Loading extension module utils...Loading extension module utils... +58: +58: +61: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +60: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: +28: +60: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +61: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +60: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +60: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +61: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +61: +61: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +61: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +61: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +60: No modifications detected for re-loaded extension module utils, skipping build step... +60: Loading extension module utils... +61: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +60: No modifications detected for re-loaded extension module utils, skipping build step... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +60: Loading extension module utils... +28: No modifications detected for re-loaded extension module utils, skipping build step... +28: Loading extension module utils... +28: No modifications detected for re-loaded extension module utils, skipping build step... +28: Loading extension module utils... +61: No modifications detected for re-loaded extension module utils, skipping build step... +61: Loading extension module utils... +61: No modifications detected for re-loaded extension module utils, skipping build step... +61: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +61: +60: No modifications detected for re-loaded extension module utils, skipping build step... +60: Loading extension module utils... +60: No modifications detected for re-loaded extension module utils, skipping build step... +28: No modifications detected for re-loaded extension module utils, skipping build step... +28: Loading extension module utils... +61: Loading extension module utils... +28: No modifications detected for re-loaded extension module utils, skipping build step... +28: Loading extension module utils... +28: No modifications detected for re-loaded extension module utils, skipping build step... +28: Loading extension module utils... +28: No modifications detected for re-loaded extension module utils, skipping build step... +28: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +28: +60: Loading extension module utils... +60: No modifications detected for re-loaded extension module utils, skipping build step... +60: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +60: +60: Loading extension module utils... +28: Loading extension module utils... +61: No modifications detected for re-loaded extension module utils, skipping build step... +61: Loading extension module utils... +60: No modifications detected for re-loaded extension module utils, skipping build step... +60: Loading extension module utils... +61: No modifications detected for re-loaded extension module utils, skipping build step... +61: Loading extension module utils... +60: No modifications detected for re-loaded extension module utils, skipping build step... +60: Loading extension module utils... +28: No modifications detected for re-loaded extension module utils, skipping build step... +28: Loading extension module utils... +61: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +61: +61: Loading extension module utils...Loading extension module utils... +61: +61: No modifications detected for re-loaded extension module utils, skipping build step... +61: Loading extension module utils... +56: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +56: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +56: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +56: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +56: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +56: +56: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +56: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +56: No modifications detected for re-loaded extension module utils, skipping build step... +56: Loading extension module utils... +56: No modifications detected for re-loaded extension module utils, skipping build step... +56: Loading extension module utils... +56: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +56: +56: +56: Loading extension module utils...Loading extension module utils...Loading extension module utils... +56: +56: +56: No modifications detected for re-loaded extension module utils, skipping build step... +56: Loading extension module utils... +56: No modifications detected for re-loaded extension module utils, skipping build step... +56: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +56: +56: Loading extension module utils... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: No modifications detected for re-loaded extension module utils, skipping build step... + 0: Loading extension module utils... + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + 0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/logs/2098877.out b/logs/2098877.out new file mode 100644 index 0000000000000000000000000000000000000000..ee1e3a515292f6e2cf4eec17fb14ba7d71b4a769 --- /dev/null +++ b/logs/2098877.out @@ -0,0 +1,17156 @@ +Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7beta --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.95 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7beta --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7beta --load checkpoints_8b7beta --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2098877.json --zero-stage 0 +START 2098877: Thu Dec 1 18:23:51 EET 2022 + 0: + 0: + 0: ======================= ROCm System Management Interface ======================= + 0: ================================= Concise Info ================================= + 0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 0: 0 43.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 0: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 0: 2 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 0: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 0: 4 47.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 0: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 0: 6 48.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 0: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 0: ================================================================================ + 0: ============================= End of ROCm SMI Log ============================== +56: +56: +56: ======================= ROCm System Management Interface ======================= +56: ================================= Concise Info ================================= +56: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +56: 0 44.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +56: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +56: 2 42.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +56: 3 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +56: 4 43.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +56: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +56: 6 42.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +56: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +56: ================================================================================ +56: ============================= End of ROCm SMI Log ============================== +62: +62: +62: ======================= ROCm System Management Interface ======================= +62: ================================= Concise Info ================================= +62: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +62: 0 47.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +62: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +62: 2 43.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +62: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +62: 4 43.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +62: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +62: 6 45.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +62: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +62: ================================================================================ +62: ============================= End of ROCm SMI Log ============================== +54: +54: +54: ======================= ROCm System Management Interface ======================= +54: ================================= Concise Info ================================= +54: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +54: 0 47.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +54: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +54: 2 45.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +54: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +54: 4 43.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +54: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +54: 6 40.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +54: 7 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +54: ================================================================================ +54: ============================= End of ROCm SMI Log ============================== +60: +60: +60: ======================= ROCm System Management Interface ======================= +60: ================================= Concise Info ================================= +60: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +60: 0 44.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +60: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +60: 2 43.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +60: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +60: 4 42.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +60: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +60: 6 46.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +60: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +60: ================================================================================ +60: ============================= End of ROCm SMI Log ============================== +51: +51: +51: ======================= ROCm System Management Interface ======================= +51: ================================= Concise Info ================================= +51: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +51: 0 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +51: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +51: 2 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +51: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +51: 4 41.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +51: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +51: 6 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +51: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +51: ================================================================================ +51: ============================= End of ROCm SMI Log ============================== +63: +63: +63: ======================= ROCm System Management Interface ======================= +63: ================================= Concise Info ================================= +63: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +63: 0 47.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +63: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +63: 2 37.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +63: 3 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +63: 4 44.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +63: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +63: 6 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +63: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +63: ================================================================================ +63: ============================= End of ROCm SMI Log ============================== + 1: + 1: + 1: ======================= ROCm System Management Interface ======================= + 1: ================================= Concise Info ================================= + 1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 1: 0 45.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 1: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 1: 2 39.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 1: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 1: 4 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 1: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 1: 6 45.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 1: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 1: ================================================================================ + 1: ============================= End of ROCm SMI Log ============================== +38: +38: +38: ======================= ROCm System Management Interface ======================= +38: ================================= Concise Info ================================= +38: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +38: 0 40.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +38: 1 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +38: 2 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +38: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +38: 4 43.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +38: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +38: 6 40.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +38: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +38: ================================================================================ +38: ============================= End of ROCm SMI Log ============================== +39: +39: +39: ======================= ROCm System Management Interface ======================= +39: ================================= Concise Info ================================= +39: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +39: 0 50.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +39: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +39: 2 43.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +39: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +39: 4 42.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +39: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +39: 6 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +39: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +39: ================================================================================ +39: ============================= End of ROCm SMI Log ============================== +57: +57: +57: ======================= ROCm System Management Interface ======================= +57: ================================= Concise Info ================================= +57: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +57: 0 44.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +57: 1 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +57: 2 42.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +57: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +57: 4 42.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +57: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +57: 6 38.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +57: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +57: ================================================================================ +57: ============================= End of ROCm SMI Log ============================== +25: +25: +25: ======================= ROCm System Management Interface ======================= +25: ================================= Concise Info ================================= +25: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +25: 0 45.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +25: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +25: 2 37.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +25: 3 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +25: 4 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +25: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +25: 6 36.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +25: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +25: ================================================================================ +25: ============================= End of ROCm SMI Log ============================== +43: +43: +43: ======================= ROCm System Management Interface ======================= +43: ================================= Concise Info ================================= +43: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +43: 0 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +43: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +43: 2 43.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +43: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +43: 4 47.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +43: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +43: 6 43.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +43: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +43: ================================================================================ +43: ============================= End of ROCm SMI Log ============================== +27: +27: +27: ======================= ROCm System Management Interface ======================= +27: ================================= Concise Info ================================= +27: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +27: 0 40.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +27: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +27: 2 45.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +27: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +27: 4 42.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +27: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +27: 6 44.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +27: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +27: ================================================================================ +27: ============================= End of ROCm SMI Log ============================== +17: +17: +17: ======================= ROCm System Management Interface ======================= +17: ================================= Concise Info ================================= +17: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +17: 0 49.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +17: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +17: 2 39.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +17: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +17: 4 42.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +17: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +17: 6 42.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +17: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +17: ================================================================================ +17: ============================= End of ROCm SMI Log ============================== +11: +11: +11: ======================= ROCm System Management Interface ======================= +11: ================================= Concise Info ================================= +11: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +11: 0 45.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +11: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +11: 2 43.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +11: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +11: 4 39.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +11: 5 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +11: 6 39.0c 80.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +11: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +11: ================================================================================ +11: ============================= End of ROCm SMI Log ============================== +13: +13: +13: ======================= ROCm System Management Interface ======================= +13: ================================= Concise Info ================================= +13: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +13: 0 45.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +13: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +13: 2 40.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +13: 3 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +13: 4 39.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +13: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +13: 6 38.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +13: 7 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +13: ================================================================================ +13: ============================= End of ROCm SMI Log ============================== +36: +36: +36: ======================= ROCm System Management Interface ======================= +36: ================================= Concise Info ================================= +36: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +36: 0 44.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +36: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +36: 2 39.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +36: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +36: 4 37.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +36: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +36: 6 39.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +36: 7 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +36: ================================================================================ +36: ============================= End of ROCm SMI Log ============================== +42: +42: +42: ======================= ROCm System Management Interface ======================= +42: ================================= Concise Info ================================= +42: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +42: 0 47.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +42: 1 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +42: 2 45.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +42: 3 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +42: 4 49.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +42: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +42: 6 45.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +42: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +42: ================================================================================ +42: ============================= End of ROCm SMI Log ============================== +44: +44: +44: ======================= ROCm System Management Interface ======================= +44: ================================= Concise Info ================================= +44: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +44: 0 42.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +44: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +44: 2 43.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +44: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +44: 4 44.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +44: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +44: 6 40.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +44: 7 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +44: ================================================================================ +44: ============================= End of ROCm SMI Log ============================== +34: +34: +34: ======================= ROCm System Management Interface ======================= +34: ================================= Concise Info ================================= +34: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +34: 0 45.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +34: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +34: 2 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +34: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +34: 4 42.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +34: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +34: 6 34.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +34: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +34: ================================================================================ +34: ============================= End of ROCm SMI Log ============================== +28: +28: +28: ======================= ROCm System Management Interface ======================= +28: ================================= Concise Info ================================= +28: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +28: 0 49.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +28: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +28: 2 39.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +28: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +28: 4 38.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +28: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +28: 6 39.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +28: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +28: ================================================================================ +28: ============================= End of ROCm SMI Log ============================== +32: +32: +32: ======================= ROCm System Management Interface ======================= +32: ================================= Concise Info ================================= +32: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +32: 0 42.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +32: 1 38.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +32: 2 41.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +32: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +32: 4 39.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +32: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +32: 6 41.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +32: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +32: ================================================================================ +32: ============================= End of ROCm SMI Log ============================== +50: +50: +50: ======================= ROCm System Management Interface ======================= +50: ================================= Concise Info ================================= +50: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +50: 0 43.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +50: 1 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +50: 2 43.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +50: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +50: 4 43.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +50: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +50: 6 43.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +50: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +50: ================================================================================ +50: ============================= End of ROCm SMI Log ============================== +46: +46: +46: ======================= ROCm System Management Interface ======================= +46: ================================= Concise Info ================================= +46: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +46: 0 49.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +46: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +46: 2 35.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +46: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +46: 4 41.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +46: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +46: 6 40.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +46: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +46: ================================================================================ +46: ============================= End of ROCm SMI Log ============================== +26: +26: +26: ======================= ROCm System Management Interface ======================= +26: ================================= Concise Info ================================= +26: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +26: 0 43.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +26: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +26: 2 41.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +26: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +26: 4 36.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +26: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +26: 6 37.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +26: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +26: ================================================================================ +26: ============================= End of ROCm SMI Log ============================== +49: +49: +49: ======================= ROCm System Management Interface ======================= +49: ================================= Concise Info ================================= +49: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +49: 0 44.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +49: 1 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +49: 2 47.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +49: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +49: 4 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +49: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +49: 6 43.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +49: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +49: ================================================================================ +49: ============================= End of ROCm SMI Log ============================== + 5: + 5: + 5: ======================= ROCm System Management Interface ======================= + 5: ================================= Concise Info ================================= + 5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 5: 0 39.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 5: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 5: 2 38.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 5: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 5: 4 39.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 5: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 5: 6 47.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 5: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 5: ================================================================================ + 5: ============================= End of ROCm SMI Log ============================== +41: +41: +41: ======================= ROCm System Management Interface ======================= +41: ================================= Concise Info ================================= +41: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +41: 0 44.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +41: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +41: 2 37.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +41: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +41: 4 42.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +41: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +41: 6 40.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +41: 7 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +41: ================================================================================ +41: ============================= End of ROCm SMI Log ============================== +20: +20: +20: ======================= ROCm System Management Interface ======================= +20: ================================= Concise Info ================================= +20: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +20: 0 47.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +20: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +20: 2 44.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +20: 3 38.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +20: 4 44.0c 80.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +20: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +20: 6 45.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +20: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +20: ================================================================================ +20: ============================= End of ROCm SMI Log ============================== +37: +37: +37: ======================= ROCm System Management Interface ======================= +37: ================================= Concise Info ================================= +37: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +37: 0 45.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +37: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +37: 2 40.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +37: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +37: 4 39.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +37: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +37: 6 41.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +37: 7 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +37: ================================================================================ +37: ============================= End of ROCm SMI Log ============================== +10: +10: +10: ======================= ROCm System Management Interface ======================= +10: ================================= Concise Info ================================= +10: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +10: 0 44.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +10: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +10: 2 46.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +10: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +10: 4 43.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +10: 5 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +10: 6 37.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +10: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +10: ================================================================================ +10: ============================= End of ROCm SMI Log ============================== + 4: + 4: + 4: ======================= ROCm System Management Interface ======================= + 4: ================================= Concise Info ================================= + 4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 4: 0 50.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 4: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 4: 2 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 4: 3 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 4: 4 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 4: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 4: 6 44.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 4: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 4: ================================================================================ + 4: ============================= End of ROCm SMI Log ============================== +14: +14: +14: ======================= ROCm System Management Interface ======================= +14: ================================= Concise Info ================================= +14: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +14: 0 48.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +14: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +14: 2 44.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +14: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +14: 4 40.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +14: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +14: 6 40.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +14: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +14: ================================================================================ +14: ============================= End of ROCm SMI Log ============================== +22: +22: +22: ======================= ROCm System Management Interface ======================= +22: ================================= Concise Info ================================= +22: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +22: 0 44.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +22: 1 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +22: 2 43.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +22: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +22: 4 39.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +22: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +22: 6 34.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +22: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +22: ================================================================================ +22: ============================= End of ROCm SMI Log ============================== +24: +24: +24: ======================= ROCm System Management Interface ======================= +24: ================================= Concise Info ================================= +24: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +24: 0 42.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +24: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +24: 2 40.0c 78.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +24: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +24: 4 44.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +24: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +24: 6 38.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +24: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +24: ================================================================================ +24: ============================= End of ROCm SMI Log ============================== +18: +18: +18: ======================= ROCm System Management Interface ======================= +18: ================================= Concise Info ================================= +18: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +18: 0 35.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +18: 1 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +18: 2 42.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +18: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +18: 4 39.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +18: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +18: 6 43.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +18: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +18: ================================================================================ +18: ============================= End of ROCm SMI Log ============================== + 8: + 8: + 8: ======================= ROCm System Management Interface ======================= + 8: ================================= Concise Info ================================= + 8: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 8: 0 43.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 8: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 8: 2 45.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 8: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 8: 4 39.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 8: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 8: 6 33.0c 80.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 8: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 8: ================================================================================ + 8: ============================= End of ROCm SMI Log ============================== +61: +61: +61: ======================= ROCm System Management Interface ======================= +61: ================================= Concise Info ================================= +61: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +61: 0 44.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +61: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +61: 2 42.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +61: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +61: 4 47.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +61: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +61: 6 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +61: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +61: ================================================================================ +61: ============================= End of ROCm SMI Log ============================== +31: +31: +31: ======================= ROCm System Management Interface ======================= +31: ================================= Concise Info ================================= +31: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +31: 0 46.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +31: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +31: 2 42.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +31: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +31: 4 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +31: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +31: 6 42.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +31: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +31: ================================================================================ +31: ============================= End of ROCm SMI Log ============================== +55: +55: +55: ======================= ROCm System Management Interface ======================= +55: ================================= Concise Info ================================= +55: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +55: 0 47.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +55: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +55: 2 39.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +55: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +55: 4 42.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +55: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +55: 6 42.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +55: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +55: ================================================================================ +55: ============================= End of ROCm SMI Log ============================== +58: +58: +58: ======================= ROCm System Management Interface ======================= +58: ================================= Concise Info ================================= +58: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +58: 0 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +58: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +58: 2 40.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +58: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +58: 4 46.0c 80.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +58: 5 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +58: 6 40.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +58: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +58: ================================================================================ +58: ============================= End of ROCm SMI Log ============================== + 7: + 7: + 7: ======================= ROCm System Management Interface ======================= + 7: ================================= Concise Info ================================= + 7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 7: 0 45.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 7: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 7: 2 49.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 7: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 7: 4 45.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 7: 5 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 7: 6 41.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 7: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 7: ================================================================================ + 7: ============================= End of ROCm SMI Log ============================== +21: +21: +21: ======================= ROCm System Management Interface ======================= +21: ================================= Concise Info ================================= +21: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +21: 0 46.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +21: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +21: 2 44.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +21: 3 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +21: 4 41.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +21: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +21: 6 39.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +21: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +21: ================================================================================ +21: ============================= End of ROCm SMI Log ============================== +33: +33: +33: ======================= ROCm System Management Interface ======================= +33: ================================= Concise Info ================================= +33: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +33: 0 44.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +33: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +33: 2 38.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +33: 3 38.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +33: 4 47.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +33: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +33: 6 40.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +33: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +33: ================================================================================ +33: ============================= End of ROCm SMI Log ============================== +45: +45: +45: ======================= ROCm System Management Interface ======================= +45: ================================= Concise Info ================================= +45: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +45: 0 45.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +45: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +45: 2 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +45: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +45: 4 41.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +45: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +45: 6 46.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +45: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +45: ================================================================================ +45: ============================= End of ROCm SMI Log ============================== +12: +12: +12: ======================= ROCm System Management Interface ======================= +12: ================================= Concise Info ================================= +12: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +12: 0 41.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +12: 1 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +12: 2 41.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +12: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +12: 4 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +12: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +12: 6 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +12: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +12: ================================================================================ +12: ============================= End of ROCm SMI Log ============================== +15: +15: +15: ======================= ROCm System Management Interface ======================= +15: ================================= Concise Info ================================= +15: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +15: 0 44.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +15: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +15: 2 37.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +15: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +15: 4 42.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +15: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +15: 6 43.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +15: 7 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +15: ================================================================================ +15: ============================= End of ROCm SMI Log ============================== +29: +29: +29: ======================= ROCm System Management Interface ======================= +29: ================================= Concise Info ================================= +29: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +29: 0 42.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +29: 1 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +29: 2 39.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +29: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +29: 4 45.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +29: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +29: 6 41.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +29: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +29: ================================================================================ +29: ============================= End of ROCm SMI Log ============================== +47: +47: +47: ======================= ROCm System Management Interface ======================= +47: ================================= Concise Info ================================= +47: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +47: 0 43.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +47: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +47: 2 44.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +47: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +47: 4 41.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +47: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +47: 6 42.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +47: 7 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +47: ================================================================================ +47: ============================= End of ROCm SMI Log ============================== + 6: + 6: + 6: ======================= ROCm System Management Interface ======================= + 6: ================================= Concise Info ================================= + 6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 6: 0 43.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 6: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 6: 2 44.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 6: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 6: 4 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 6: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 6: 6 38.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 6: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 6: ================================================================================ + 6: ============================= End of ROCm SMI Log ============================== +52: +52: +52: ======================= ROCm System Management Interface ======================= +52: ================================= Concise Info ================================= +52: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +52: 0 45.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +52: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +52: 2 43.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +52: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +52: 4 39.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +52: 5 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +52: 6 40.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +52: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +52: ================================================================================ +52: ============================= End of ROCm SMI Log ============================== +48: +48: +48: ======================= ROCm System Management Interface ======================= +48: ================================= Concise Info ================================= +48: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +48: 0 44.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +48: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +48: 2 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +48: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +48: 4 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +48: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +48: 6 45.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +48: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +48: ================================================================================ +48: ============================= End of ROCm SMI Log ============================== +59: +59: +59: ======================= ROCm System Management Interface ======================= +59: ================================= Concise Info ================================= +59: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +59: 0 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +59: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +59: 2 45.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +59: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +59: 4 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +59: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +59: 6 34.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +59: 7 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +59: ================================================================================ +59: ============================= End of ROCm SMI Log ============================== + 9: + 9: + 9: ======================= ROCm System Management Interface ======================= + 9: ================================= Concise Info ================================= + 9: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 9: 0 44.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 9: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 9: 2 45.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 9: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 9: 4 42.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 9: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 9: 6 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 9: 7 37.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 9: ================================================================================ + 9: ============================= End of ROCm SMI Log ============================== +53: +53: +53: ======================= ROCm System Management Interface ======================= +53: ================================= Concise Info ================================= +53: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +53: 0 45.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +53: 1 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +53: 2 48.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +53: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +53: 4 44.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +53: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +53: 6 39.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +53: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +53: ================================================================================ +53: ============================= End of ROCm SMI Log ============================== + 2: + 2: + 2: ======================= ROCm System Management Interface ======================= + 2: ================================= Concise Info ================================= + 2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 2: 0 45.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 2: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 2: 2 38.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 2: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 2: 4 46.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 2: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 2: 6 37.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 2: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 2: ================================================================================ + 2: ============================= End of ROCm SMI Log ============================== +40: +40: +40: ======================= ROCm System Management Interface ======================= +40: ================================= Concise Info ================================= +40: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +40: 0 48.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +40: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +40: 2 47.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +40: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +40: 4 47.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +40: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +40: 6 45.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +40: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +40: ================================================================================ +40: ============================= End of ROCm SMI Log ============================== +35: +35: +35: ======================= ROCm System Management Interface ======================= +35: ================================= Concise Info ================================= +35: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +35: 0 46.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +35: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +35: 2 46.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +35: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +35: 4 40.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +35: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +35: 6 46.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +35: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +35: ================================================================================ +35: ============================= End of ROCm SMI Log ============================== + 3: + 3: + 3: ======================= ROCm System Management Interface ======================= + 3: ================================= Concise Info ================================= + 3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 3: 0 47.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 3: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 3: 2 40.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 3: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 3: 4 38.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 3: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 3: 6 36.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 3: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 3: ================================================================================ + 3: ============================= End of ROCm SMI Log ============================== +23: +23: +23: ======================= ROCm System Management Interface ======================= +23: ================================= Concise Info ================================= +23: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +23: 0 42.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +23: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +23: 2 46.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +23: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +23: 4 45.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +23: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +23: 6 40.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +23: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +23: ================================================================================ +23: ============================= End of ROCm SMI Log ============================== +30: +30: +30: ======================= ROCm System Management Interface ======================= +30: ================================= Concise Info ================================= +30: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +30: 0 40.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +30: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +30: 2 36.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +30: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +30: 4 47.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +30: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +30: 6 43.0c 79.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +30: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +30: ================================================================================ +30: ============================= End of ROCm SMI Log ============================== +19: +19: +19: ======================= ROCm System Management Interface ======================= +19: ================================= Concise Info ================================= +19: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +19: 0 43.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +19: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +19: 2 45.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +19: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +19: 4 45.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +19: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +19: 6 42.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +19: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +19: ================================================================================ +19: ============================= End of ROCm SMI Log ============================== +16: +16: +16: ======================= ROCm System Management Interface ======================= +16: ================================= Concise Info ================================= +16: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +16: 0 43.0c 100.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +16: 1 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +16: 2 38.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +16: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +16: 4 40.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +16: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +16: 6 43.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +16: 7 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +16: ================================================================================ +16: ============================= End of ROCm SMI Log ============================== +10: Launching on nid005548 (10/64), master nid005302 port 9999, GPUs 8, CUDA: True + 6: Launching on nid005544 (6/64), master nid005302 port 9999, GPUs 8, CUDA: True +16: Launching on nid006150 (16/64), master nid005302 port 9999, GPUs 8, CUDA: True +24: Launching on nid006279 (24/64), master nid005302 port 9999, GPUs 8, CUDA: True +43: Launching on nid006787 (43/64), master nid005302 port 9999, GPUs 8, CUDA: True +41: Launching on nid006785 (41/64), master nid005302 port 9999, GPUs 8, CUDA: True +40: Launching on nid006771 (40/64), master nid005302 port 9999, GPUs 8, CUDA: True +13: Launching on nid005552 (13/64), master nid005302 port 9999, GPUs 8, CUDA: True + 7: Launching on nid005545 (7/64), master nid005302 port 9999, GPUs 8, CUDA: True +63: Launching on nid007434 (63/64), master nid005302 port 9999, GPUs 8, CUDA: True +11: Launching on nid005549 (11/64), master nid005302 port 9999, GPUs 8, CUDA: True +33: Launching on nid006550 (33/64), master nid005302 port 9999, GPUs 8, CUDA: True +27: Launching on nid006318 (27/64), master nid005302 port 9999, GPUs 8, CUDA: True + 9: Launching on nid005547 (9/64), master nid005302 port 9999, GPUs 8, CUDA: True +17: Launching on nid006151 (17/64), master nid005302 port 9999, GPUs 8, CUDA: True +59: Launching on nid007426 (59/64), master nid005302 port 9999, GPUs 8, CUDA: True +35: Launching on nid006552 (35/64), master nid005302 port 9999, GPUs 8, CUDA: True +18: Launching on nid006152 (18/64), master nid005302 port 9999, GPUs 8, CUDA: True +12: Launching on nid005550 (12/64), master nid005302 port 9999, GPUs 8, CUDA: True +48: Launching on nid006920 (48/64), master nid005302 port 9999, GPUs 8, CUDA: True + 1: Launching on nid005304 (1/64), master nid005302 port 9999, GPUs 8, CUDA: True +53: Launching on nid007199 (53/64), master nid005302 port 9999, GPUs 8, CUDA: True +44: Launching on nid006788 (44/64), master nid005302 port 9999, GPUs 8, CUDA: True +32: Launching on nid006549 (32/64), master nid005302 port 9999, GPUs 8, CUDA: True +51: Launching on nid007197 (51/64), master nid005302 port 9999, GPUs 8, CUDA: True +29: Launching on nid006546 (29/64), master nid005302 port 9999, GPUs 8, CUDA: True + 3: Launching on nid005457 (3/64), master nid005302 port 9999, GPUs 8, CUDA: True +19: Launching on nid006153 (19/64), master nid005302 port 9999, GPUs 8, CUDA: True +21: Launching on nid006155 (21/64), master nid005302 port 9999, GPUs 8, CUDA: True +42: Launching on nid006786 (42/64), master nid005302 port 9999, GPUs 8, CUDA: True +47: Launching on nid006918 (47/64), master nid005302 port 9999, GPUs 8, CUDA: True +23: Launching on nid006277 (23/64), master nid005302 port 9999, GPUs 8, CUDA: True +50: Launching on nid007196 (50/64), master nid005302 port 9999, GPUs 8, CUDA: True +58: Launching on nid007423 (58/64), master nid005302 port 9999, GPUs 8, CUDA: True +55: Launching on nid007365 (55/64), master nid005302 port 9999, GPUs 8, CUDA: True +60: Launching on nid007427 (60/64), master nid005302 port 9999, GPUs 8, CUDA: True +22: Launching on nid006276 (22/64), master nid005302 port 9999, GPUs 8, CUDA: True + 8: Launching on nid005546 (8/64), master nid005302 port 9999, GPUs 8, CUDA: True +45: Launching on nid006916 (45/64), master nid005302 port 9999, GPUs 8, CUDA: True +26: Launching on nid006317 (26/64), master nid005302 port 9999, GPUs 8, CUDA: True +46: Launching on nid006917 (46/64), master nid005302 port 9999, GPUs 8, CUDA: True +52: Launching on nid007198 (52/64), master nid005302 port 9999, GPUs 8, CUDA: True +39: Launching on nid006701 (39/64), master nid005302 port 9999, GPUs 8, CUDA: True +15: Launching on nid006148 (15/64), master nid005302 port 9999, GPUs 8, CUDA: True +25: Launching on nid006316 (25/64), master nid005302 port 9999, GPUs 8, CUDA: True +57: Launching on nid007422 (57/64), master nid005302 port 9999, GPUs 8, CUDA: True + 5: Launching on nid005510 (5/64), master nid005302 port 9999, GPUs 8, CUDA: True +31: Launching on nid006548 (31/64), master nid005302 port 9999, GPUs 8, CUDA: True +37: Launching on nid006601 (37/64), master nid005302 port 9999, GPUs 8, CUDA: True + 4: Launching on nid005509 (4/64), master nid005302 port 9999, GPUs 8, CUDA: True + 2: Launching on nid005456 (2/64), master nid005302 port 9999, GPUs 8, CUDA: True +36: Launching on nid006600 (36/64), master nid005302 port 9999, GPUs 8, CUDA: True +20: Launching on nid006154 (20/64), master nid005302 port 9999, GPUs 8, CUDA: True +61: Launching on nid007428 (61/64), master nid005302 port 9999, GPUs 8, CUDA: True +34: Launching on nid006551 (34/64), master nid005302 port 9999, GPUs 8, CUDA: True +38: Launching on nid006615 (38/64), master nid005302 port 9999, GPUs 8, CUDA: True +28: Launching on nid006545 (28/64), master nid005302 port 9999, GPUs 8, CUDA: True +14: Launching on nid005889 (14/64), master nid005302 port 9999, GPUs 8, CUDA: True + 0: Launching on nid005302 (0/64), master nid005302 port 9999, GPUs 8, CUDA: True +56: Launching on nid007420 (56/64), master nid005302 port 9999, GPUs 8, CUDA: True +49: Launching on nid007195 (49/64), master nid005302 port 9999, GPUs 8, CUDA: True +54: Launching on nid007200 (54/64), master nid005302 port 9999, GPUs 8, CUDA: True +30: Launching on nid006547 (30/64), master nid005302 port 9999, GPUs 8, CUDA: True +62: Launching on nid007429 (62/64), master nid005302 port 9999, GPUs 8, CUDA: True + 0: using world size: 512, data-parallel-size: 128, tensor-model-parallel size: 2, pipeline-model-parallel size: 2 + 0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. + 0: using torch.bfloat16 for parameters ... + 0: ------------------------ arguments ------------------------ + 0: abort_on_unmet_fused_kernel_constraints ......... False + 0: accumulate_allreduce_grads_in_fp32 .............. True + 0: adam_beta1 ...................................... 0.9 + 0: adam_beta2 ...................................... 0.95 + 0: adam_eps ........................................ 1e-08 + 0: adlr_autoresume ................................. False + 0: adlr_autoresume_interval ........................ 1000 + 0: apply_query_key_layer_scaling ................... True + 0: apply_residual_connection_post_layernorm ........ False + 0: attention_dropout ............................... 0.1 + 0: attention_softmax_in_fp32 ....................... False + 0: bert_binary_head ................................ True + 0: bert_load ....................................... None + 0: bf16 ............................................ True + 0: bias_dropout_fusion ............................. True + 0: bias_gelu_fusion ................................ True + 0: biencoder_projection_dim ........................ 0 + 0: biencoder_shared_query_context_model ............ False + 0: block_data_path ................................. None + 0: checkpoint_activations .......................... False + 0: checkpoint_in_cpu ............................... False + 0: checkpoint_num_layers ........................... 1 + 0: clip_grad ....................................... 1.0 + 0: codecarbon_dir .................................. None + 0: consumed_train_samples .......................... 0 + 0: consumed_train_tokens ........................... 0 + 0: consumed_valid_samples .......................... 0 + 0: contigious_checkpointing ........................ False + 0: cpu_optimizer ................................... False + 0: cpu_torch_adam .................................. False + 0: curriculum_learning ............................. False + 0: data_impl ....................................... mmap + 0: data_parallel_size .............................. 128 + 0: data_path ....................................... ['/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document'] + 0: dataloader_type ................................. single + 0: DDP_impl ........................................ local + 0: decoder_seq_length .............................. None + 0: deepscale ....................................... False + 0: deepscale_config ................................ None + 0: deepspeed ....................................... True + 0: deepspeed_activation_checkpointing .............. False + 0: deepspeed_config ................................ ds_configs/2098877.json + 0: deepspeed_mpi ................................... False + 0: distribute_checkpointed_activations ............. False + 0: distributed_backend ............................. nccl + 0: embed_layernorm ................................. False + 0: embedding_path .................................. None + 0: encoder_seq_length .............................. 2048 + 0: eod_mask_loss ................................... False + 0: eval_interval ................................... 1000 + 0: eval_iters ...................................... 1 + 0: eval_only ....................................... None + 0: evidence_data_path .............................. None + 0: exit_duration_in_mins ........................... None + 0: exit_interval ................................... None + 0: ffn_hidden_size ................................. 16384 + 0: finetune ........................................ False + 0: fp16 ............................................ False + 0: fp16_lm_cross_entropy ........................... False + 0: fp32_residual_connection ........................ False + 0: gigaflos_no_embeds .............................. 0 + 0: global_batch_size ............................... 1024 + 0: glu_activation .................................. None + 0: hidden_dropout .................................. 0.1 + 0: hidden_size ..................................... 4096 + 0: hysteresis ...................................... 2 + 0: ict_head_size ................................... None + 0: ict_load ........................................ None + 0: img_dim ......................................... 224 + 0: indexer_batch_size .............................. 128 + 0: indexer_log_interval ............................ 1000 + 0: inference ....................................... False + 0: init_method_std ................................. 0.02 + 0: init_method_xavier_uniform ...................... False + 0: initial_loss_scale .............................. 4294967296 + 0: kill_switch_path ................................ kill-switch-8b7beta + 0: kv_channels ..................................... 128 + 0: layer_norm_fusion ............................... True + 0: layernorm_epsilon ............................... 1e-05 + 0: lazy_mpu_init ................................... None + 0: load ............................................ checkpoints_8b7beta + 0: local_rank ...................................... None + 0: log_batch_size_to_tensorboard ................... True + 0: log_interval .................................... 10 + 0: log_learning_rate_to_tensorboard ................ True + 0: log_level ....................................... None + 0: log_level_replica ............................... None + 0: log_loss_scale_to_tensorboard ................... True + 0: log_num_zeros_in_grad ........................... False + 0: log_params_norm ................................. False + 0: log_path ........................................ None + 0: log_timers_to_tensorboard ....................... True + 0: log_validation_ppl_to_tensorboard ............... True + 0: loss_on_targets_only ............................ False + 0: loss_scale ...................................... None + 0: loss_scale_window ............................... 1000 + 0: lr .............................................. 0.0002 + 0: lr_decay_iters .................................. None + 0: lr_decay_samples ................................ 5625981 + 0: lr_decay_style .................................. cosine + 0: lr_decay_tokens ................................. None + 0: lr_warmup_fraction .............................. None + 0: lr_warmup_iters ................................. 0 + 0: lr_warmup_samples ............................... 56260 + 0: make_vocab_size_divisible_by .................... 128 + 0: mask_prob ....................................... 0.15 + 0: masked_softmax_fusion ........................... True + 0: max_position_embeddings ......................... 2048 + 0: mean_noise_span_length .......................... None + 0: memory_centric_tiled_linear ..................... False + 0: merge_file ...................................... gpt2/merges.txt + 0: micro_batch_size ................................ 2 + 0: min_loss_scale .................................. 1.0 + 0: min_lr .......................................... 2e-05 + 0: mmap_warmup ..................................... False + 0: no_load_optim ................................... None + 0: no_load_rng ..................................... None + 0: no_save_optim ................................... None + 0: no_save_rng ..................................... None + 0: noise_density ................................... None + 0: num_attention_heads ............................. 32 + 0: num_channels .................................... 3 + 0: num_classes ..................................... 1000 + 0: num_layers ...................................... 42 + 0: num_layers_per_virtual_pipeline_stage ........... None + 0: num_workers ..................................... 2 + 0: onnx_safe ....................................... None + 0: openai_gelu ..................................... False + 0: optimizer ....................................... adam + 0: optimizer_fusion ................................ True + 0: override_lr_scheduler ........................... False + 0: pad_vocab_size_to ............................... None + 0: params_dtype .................................... torch.bfloat16 + 0: partition_activations ........................... False + 0: patch_dim ....................................... 16 + 0: pipeline_model_parallel_size .................... 2 + 0: position_embedding_type ......................... PositionEmbeddingType.absolute + 0: pp_partition_method ............................. None + 0: profile_backward ................................ False + 0: query_in_block_prob ............................. 0.1 + 0: rampup_batch_size ............................... None + 0: rank ............................................ 0 + 0: remote_device ................................... none + 0: reset_attention_mask ............................ False + 0: reset_position_ids .............................. False + 0: retriever_report_topk_accuracies ................ [] + 0: retriever_score_scaling ......................... False + 0: retriever_seq_length ............................ 256 + 0: reweight_loss_based_on_position_frequency ....... False + 0: sample_rate ..................................... 1.0 + 0: save ............................................ checkpoints_8b7beta + 0: save_interval ................................... 1000 + 0: scatter_gather_tensors_in_pipeline .............. True + 0: scattered_embeddings ............................ False + 0: seed ............................................ 1234 + 0: seq_length ...................................... 2048 + 0: sgd_momentum .................................... 0.9 + 0: short_seq_prob .................................. 0.1 + 0: skip_train_iteration_range ...................... None + 0: split ........................................... 949,50,1 + 0: split_transformers .............................. False + 0: sync_tp_duplicated_parameters ................... False + 0: synchronize_each_layer .......................... False + 0: tensor_model_parallel_size ...................... 2 + 0: tensorboard_dir ................................. tensorboard_8b7beta + 0: tensorboard_log_interval ........................ 1 + 0: tensorboard_queue_size .......................... 5 + 0: test_weighted_split_names ....................... None + 0: test_weighted_split_paths ....................... None + 0: test_weighted_split_paths_path .................. None + 0: test_weighted_split_splits ...................... None + 0: test_weighted_split_weights ..................... None + 0: tile_factor ..................................... 1 + 0: titles_data_path ................................ None + 0: tokenizer_name_or_path .......................... None + 0: tokenizer_type .................................. GPT2BPETokenizer + 0: train_iters ..................................... None + 0: train_samples ................................... 5625981 + 0: train_tokens .................................... None + 0: train_weighted_split_paths ...................... None + 0: train_weighted_split_paths_path ................. None + 0: universal_checkpoint ............................ False + 0: use_bnb_optimizer ............................... False + 0: use_checkpoint_lr_scheduler ..................... False + 0: use_contiguous_buffers_in_ddp ................... True + 0: use_cpu_initialization .......................... None + 0: use_one_sent_docs ............................... False + 0: use_pin_memory .................................. False + 0: valid_num_workers ............................... 2 + 0: valid_weighted_split_names ...................... None + 0: valid_weighted_split_paths ...................... None + 0: valid_weighted_split_paths_path ................. None + 0: valid_weighted_split_splits ..................... None + 0: valid_weighted_split_weights .................... None + 0: virtual_pipeline_model_parallel_size ............ None + 0: vocab_extra_ids ................................. 0 + 0: vocab_file ...................................... gpt2/vocab.json + 0: weight_decay .................................... 0.1 + 0: world_size ...................................... 512 + 0: zero_allgather_bucket_size ...................... 0.0 + 0: zero_contigious_gradients ....................... False + 0: zero_reduce_bucket_size ......................... 0.0 + 0: zero_reduce_scatter ............................. False + 0: zero_stage ...................................... 0 + 0: -------------------- end of arguments --------------------- + 0: setting number of micro-batches to constant 4 + 0: > building GPT2BPETokenizer tokenizer ... + 0: > padded vocab (size: 50257) with 175 dummy tokens (new size: 50432) + 0: DeepSpeed general environment info: + 0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] + 0: torch version .................... 1.13.0+rocm5.2 + 0: torch cuda version ............... None + 0: torch hip version ................ 5.2.21151-afdc89f8 + 0: nvcc version ..................... None + 0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] + 0: deepspeed info ................... 0.7.5, unknown, unknown + 0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 + 0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** + 0: > initializing torch distributed ... + 0: [2022-12-01 18:25:37,396] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +63: > setting tensorboard ... + 0: > initializing tensor model parallel with size 2 + 0: > initializing pipeline model parallel with size 2 + 0: > setting random seeds to 1234 ... + 0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 + 0: > compiling dataset index builder ... + 0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' + 0: make: Nothing to be done for 'default'. + 0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' + 0: >>> done with dataset index builder. Compilation time: 0.100 seconds + 0: > compiling and loading fused kernels ... + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] + 0: Total number of unsupported CUDA function calls: 0 + 0: + 0: + 0: Total number of replaced kernel launches: 87 + 0: [1/1] c++ scaled_upper_triang_masked_softmax_hip.o scaled_upper_triang_masked_softmax_hip.cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_upper_triang_masked_softmax_cuda.so + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] + 0: Total number of unsupported CUDA function calls: 0 + 0: + 0: + 0: Total number of replaced kernel launches: 63 + 0: [1/1] c++ scaled_masked_softmax_hip.cuda.o scaled_masked_softmax_hip.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_masked_softmax_cuda.so + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] + 0: Total number of unsupported CUDA function calls: 0 + 0: + 0: + 0: Total number of replaced kernel launches: 67 + 0: [1/1] c++ layer_norm_cuda.o layer_norm_hip_kernel.cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o fused_mix_prec_layer_norm_cuda.so + 0: >>> done with compiling and loading fused kernels. Compilation time: 33.429 seconds + 0: time to initialize megatron (seconds): 57.053 + 0: [after megatron is initialized] datetime: 2022-12-01 18:26:33 + 0: building GPT model ... + 0: [2022-12-01 18:26:33,111] [INFO] [utils.py:827:see_memory_usage] Before Building Model + 0: [2022-12-01 18:26:33,111] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB + 0: [2022-12-01 18:26:33,111] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 29.53 GB, percent = 5.9% + 0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None + 0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=1, model=0): 2, ProcessCoord(pipe=0, data=1, model=1): 3, ProcessCoord(pipe=0, data=2, model=0): 4, ProcessCoord(pipe=0, data=2, model=1): 5, ProcessCoord(pipe=0, data=3, model=0): 6, ProcessCoord(pipe=0, data=3, model=1): 7, ProcessCoord(pipe=0, data=4, model=0): 8, ProcessCoord(pipe=0, data=4, model=1): 9, ProcessCoord(pipe=0, data=5, model=0): 10, ProcessCoord(pipe=0, data=5, model=1): 11, ProcessCoord(pipe=0, data=6, model=0): 12, ProcessCoord(pipe=0, data=6, model=1): 13, ProcessCoord(pipe=0, data=7, model=0): 14, ProcessCoord(pipe=0, data=7, model=1): 15, ProcessCoord(pipe=0, data=8, model=0): 16, ProcessCoord(pipe=0, data=8, model=1): 17, ProcessCoord(pipe=0, data=9, model=0): 18, ProcessCoord(pipe=0, data=9, model=1): 19, ProcessCoord(pipe=0, data=10, model=0): 20, ProcessCoord(pipe=0, data=10, model=1): 21, ProcessCoord(pipe=0, data=11, model=0): 22, ProcessCoord(pipe=0, data + 0: =11, model=1): 23, ProcessCoord(pipe=0, data=12, model=0): 24, ProcessCoord(pipe=0, data=12, model=1): 25, ProcessCoord(pipe=0, data=13, model=0): 26, ProcessCoord(pipe=0, data=13, model=1): 27, ProcessCoord(pipe=0, data=14, model=0): 28, ProcessCoord(pipe=0, data=14, model=1): 29, ProcessCoord(pipe=0, data=15, model=0): 30, ProcessCoord(pipe=0, data=15, model=1): 31, ProcessCoord(pipe=0, data=16, model=0): 32, ProcessCoord(pipe=0, data=16, model=1): 33, ProcessCoord(pipe=0, data=17, model=0): 34, ProcessCoord(pipe=0, data=17, model=1): 35, ProcessCoord(pipe=0, data=18, model=0): 36, ProcessCoord(pipe=0, data=18, model=1): 37, ProcessCoord(pipe=0, data=19, model=0): 38, ProcessCoord(pipe=0, data=19, model=1): 39, ProcessCoord(pipe=0, data=20, model=0): 40, ProcessCoord(pipe=0, data=20, model=1): 41, ProcessCoord(pipe=0, data=21, model=0): 42, ProcessCoord(pipe=0, data=21, model=1): 43, ProcessCoord(pipe=0, data=22, model=0): 44, ProcessCoord(pipe=0, data=22, model=1): 45, ProcessCoord(pipe=0, data=23, model=0 + 0: ): 46, ProcessCoord(pipe=0, data=23, model=1): 47, ProcessCoord(pipe=0, data=24, model=0): 48, ProcessCoord(pipe=0, data=24, model=1): 49, ProcessCoord(pipe=0, data=25, model=0): 50, ProcessCoord(pipe=0, data=25, model=1): 51, ProcessCoord(pipe=0, data=26, model=0): 52, ProcessCoord(pipe=0, data=26, model=1): 53, ProcessCoord(pipe=0, data=27, model=0): 54, ProcessCoord(pipe=0, data=27, model=1): 55, ProcessCoord(pipe=0, data=28, model=0): 56, ProcessCoord(pipe=0, data=28, model=1): 57, ProcessCoord(pipe=0, data=29, model=0): 58, ProcessCoord(pipe=0, data=29, model=1): 59, ProcessCoord(pipe=0, data=30, model=0): 60, ProcessCoord(pipe=0, data=30, model=1): 61, ProcessCoord(pipe=0, data=31, model=0): 62, ProcessCoord(pipe=0, data=31, model=1): 63, ProcessCoord(pipe=0, data=32, model=0): 64, ProcessCoord(pipe=0, data=32, model=1): 65, ProcessCoord(pipe=0, data=33, model=0): 66, ProcessCoord(pipe=0, data=33, model=1): 67, ProcessCoord(pipe=0, data=34, model=0): 68, ProcessCoord(pipe=0, data=34, model=1): 69, Proce + 0: ssCoord(pipe=0, data=35, model=0): 70, ProcessCoord(pipe=0, data=35, model=1): 71, ProcessCoord(pipe=0, data=36, model=0): 72, ProcessCoord(pipe=0, data=36, model=1): 73, ProcessCoord(pipe=0, data=37, model=0): 74, ProcessCoord(pipe=0, data=37, model=1): 75, ProcessCoord(pipe=0, data=38, model=0): 76, ProcessCoord(pipe=0, data=38, model=1): 77, ProcessCoord(pipe=0, data=39, model=0): 78, ProcessCoord(pipe=0, data=39, model=1): 79, ProcessCoord(pipe=0, data=40, model=0): 80, ProcessCoord(pipe=0, data=40, model=1): 81, ProcessCoord(pipe=0, data=41, model=0): 82, ProcessCoord(pipe=0, data=41, model=1): 83, ProcessCoord(pipe=0, data=42, model=0): 84, ProcessCoord(pipe=0, data=42, model=1): 85, ProcessCoord(pipe=0, data=43, model=0): 86, ProcessCoord(pipe=0, data=43, model=1): 87, ProcessCoord(pipe=0, data=44, model=0): 88, ProcessCoord(pipe=0, data=44, model=1): 89, ProcessCoord(pipe=0, data=45, model=0): 90, ProcessCoord(pipe=0, data=45, model=1): 91, ProcessCoord(pipe=0, data=46, model=0): 92, ProcessCoord(pipe + 0: =0, data=46, model=1): 93, ProcessCoord(pipe=0, data=47, model=0): 94, ProcessCoord(pipe=0, data=47, model=1): 95, ProcessCoord(pipe=0, data=48, model=0): 96, ProcessCoord(pipe=0, data=48, model=1): 97, ProcessCoord(pipe=0, data=49, model=0): 98, ProcessCoord(pipe=0, data=49, model=1): 99, ProcessCoord(pipe=0, data=50, model=0): 100, ProcessCoord(pipe=0, data=50, model=1): 101, ProcessCoord(pipe=0, data=51, model=0): 102, ProcessCoord(pipe=0, data=51, model=1): 103, ProcessCoord(pipe=0, data=52, model=0): 104, ProcessCoord(pipe=0, data=52, model=1): 105, ProcessCoord(pipe=0, data=53, model=0): 106, ProcessCoord(pipe=0, data=53, model=1): 107, ProcessCoord(pipe=0, data=54, model=0): 108, ProcessCoord(pipe=0, data=54, model=1): 109, ProcessCoord(pipe=0, data=55, model=0): 110, ProcessCoord(pipe=0, data=55, model=1): 111, ProcessCoord(pipe=0, data=56, model=0): 112, ProcessCoord(pipe=0, data=56, model=1): 113, ProcessCoord(pipe=0, data=57, model=0): 114, ProcessCoord(pipe=0, data=57, model=1): 115, ProcessCoord( + 0: pipe=0, data=58, model=0): 116, ProcessCoord(pipe=0, data=58, model=1): 117, ProcessCoord(pipe=0, data=59, model=0): 118, ProcessCoord(pipe=0, data=59, model=1): 119, ProcessCoord(pipe=0, data=60, model=0): 120, ProcessCoord(pipe=0, data=60, model=1): 121, ProcessCoord(pipe=0, data=61, model=0): 122, ProcessCoord(pipe=0, data=61, model=1): 123, ProcessCoord(pipe=0, data=62, model=0): 124, ProcessCoord(pipe=0, data=62, model=1): 125, ProcessCoord(pipe=0, data=63, model=0): 126, ProcessCoord(pipe=0, data=63, model=1): 127, ProcessCoord(pipe=0, data=64, model=0): 128, ProcessCoord(pipe=0, data=64, model=1): 129, ProcessCoord(pipe=0, data=65, model=0): 130, ProcessCoord(pipe=0, data=65, model=1): 131, ProcessCoord(pipe=0, data=66, model=0): 132, ProcessCoord(pipe=0, data=66, model=1): 133, ProcessCoord(pipe=0, data=67, model=0): 134, ProcessCoord(pipe=0, data=67, model=1): 135, ProcessCoord(pipe=0, data=68, model=0): 136, ProcessCoord(pipe=0, data=68, model=1): 137, ProcessCoord(pipe=0, data=69, model=0): 138, Pr + 0: ocessCoord(pipe=0, data=69, model=1): 139, ProcessCoord(pipe=0, data=70, model=0): 140, ProcessCoord(pipe=0, data=70, model=1): 141, ProcessCoord(pipe=0, data=71, model=0): 142, ProcessCoord(pipe=0, data=71, model=1): 143, ProcessCoord(pipe=0, data=72, model=0): 144, ProcessCoord(pipe=0, data=72, model=1): 145, ProcessCoord(pipe=0, data=73, model=0): 146, ProcessCoord(pipe=0, data=73, model=1): 147, ProcessCoord(pipe=0, data=74, model=0): 148, ProcessCoord(pipe=0, data=74, model=1): 149, ProcessCoord(pipe=0, data=75, model=0): 150, ProcessCoord(pipe=0, data=75, model=1): 151, ProcessCoord(pipe=0, data=76, model=0): 152, ProcessCoord(pipe=0, data=76, model=1): 153, ProcessCoord(pipe=0, data=77, model=0): 154, ProcessCoord(pipe=0, data=77, model=1): 155, ProcessCoord(pipe=0, data=78, model=0): 156, ProcessCoord(pipe=0, data=78, model=1): 157, ProcessCoord(pipe=0, data=79, model=0): 158, ProcessCoord(pipe=0, data=79, model=1): 159, ProcessCoord(pipe=0, data=80, model=0): 160, ProcessCoord(pipe=0, data=80, model= + 0: 1): 161, ProcessCoord(pipe=0, data=81, model=0): 162, ProcessCoord(pipe=0, data=81, model=1): 163, ProcessCoord(pipe=0, data=82, model=0): 164, ProcessCoord(pipe=0, data=82, model=1): 165, ProcessCoord(pipe=0, data=83, model=0): 166, ProcessCoord(pipe=0, data=83, model=1): 167, ProcessCoord(pipe=0, data=84, model=0): 168, ProcessCoord(pipe=0, data=84, model=1): 169, ProcessCoord(pipe=0, data=85, model=0): 170, ProcessCoord(pipe=0, data=85, model=1): 171, ProcessCoord(pipe=0, data=86, model=0): 172, ProcessCoord(pipe=0, data=86, model=1): 173, ProcessCoord(pipe=0, data=87, model=0): 174, ProcessCoord(pipe=0, data=87, model=1): 175, ProcessCoord(pipe=0, data=88, model=0): 176, ProcessCoord(pipe=0, data=88, model=1): 177, ProcessCoord(pipe=0, data=89, model=0): 178, ProcessCoord(pipe=0, data=89, model=1): 179, ProcessCoord(pipe=0, data=90, model=0): 180, ProcessCoord(pipe=0, data=90, model=1): 181, ProcessCoord(pipe=0, data=91, model=0): 182, ProcessCoord(pipe=0, data=91, model=1): 183, ProcessCoord(pipe=0, data + 0: =92, model=0): 184, ProcessCoord(pipe=0, data=92, model=1): 185, ProcessCoord(pipe=0, data=93, model=0): 186, ProcessCoord(pipe=0, data=93, model=1): 187, ProcessCoord(pipe=0, data=94, model=0): 188, ProcessCoord(pipe=0, data=94, model=1): 189, ProcessCoord(pipe=0, data=95, model=0): 190, ProcessCoord(pipe=0, data=95, model=1): 191, ProcessCoord(pipe=0, data=96, model=0): 192, ProcessCoord(pipe=0, data=96, model=1): 193, ProcessCoord(pipe=0, data=97, model=0): 194, ProcessCoord(pipe=0, data=97, model=1): 195, ProcessCoord(pipe=0, data=98, model=0): 196, ProcessCoord(pipe=0, data=98, model=1): 197, ProcessCoord(pipe=0, data=99, model=0): 198, ProcessCoord(pipe=0, data=99, model=1): 199, ProcessCoord(pipe=0, data=100, model=0): 200, ProcessCoord(pipe=0, data=100, model=1): 201, ProcessCoord(pipe=0, data=101, model=0): 202, ProcessCoord(pipe=0, data=101, model=1): 203, ProcessCoord(pipe=0, data=102, model=0): 204, ProcessCoord(pipe=0, data=102, model=1): 205, ProcessCoord(pipe=0, data=103, model=0): 206, Process + 0: Coord(pipe=0, data=103, model=1): 207, ProcessCoord(pipe=0, data=104, model=0): 208, ProcessCoord(pipe=0, data=104, model=1): 209, ProcessCoord(pipe=0, data=105, model=0): 210, ProcessCoord(pipe=0, data=105, model=1): 211, ProcessCoord(pipe=0, data=106, model=0): 212, ProcessCoord(pipe=0, data=106, model=1): 213, ProcessCoord(pipe=0, data=107, model=0): 214, ProcessCoord(pipe=0, data=107, model=1): 215, ProcessCoord(pipe=0, data=108, model=0): 216, ProcessCoord(pipe=0, data=108, model=1): 217, ProcessCoord(pipe=0, data=109, model=0): 218, ProcessCoord(pipe=0, data=109, model=1): 219, ProcessCoord(pipe=0, data=110, model=0): 220, ProcessCoord(pipe=0, data=110, model=1): 221, ProcessCoord(pipe=0, data=111, model=0): 222, ProcessCoord(pipe=0, data=111, model=1): 223, ProcessCoord(pipe=0, data=112, model=0): 224, ProcessCoord(pipe=0, data=112, model=1): 225, ProcessCoord(pipe=0, data=113, model=0): 226, ProcessCoord(pipe=0, data=113, model=1): 227, ProcessCoord(pipe=0, data=114, model=0): 228, ProcessCoord(pipe=0 + 0: , data=114, model=1): 229, ProcessCoord(pipe=0, data=115, model=0): 230, ProcessCoord(pipe=0, data=115, model=1): 231, ProcessCoord(pipe=0, data=116, model=0): 232, ProcessCoord(pipe=0, data=116, model=1): 233, ProcessCoord(pipe=0, data=117, model=0): 234, ProcessCoord(pipe=0, data=117, model=1): 235, ProcessCoord(pipe=0, data=118, model=0): 236, ProcessCoord(pipe=0, data=118, model=1): 237, ProcessCoord(pipe=0, data=119, model=0): 238, ProcessCoord(pipe=0, data=119, model=1): 239, ProcessCoord(pipe=0, data=120, model=0): 240, ProcessCoord(pipe=0, data=120, model=1): 241, ProcessCoord(pipe=0, data=121, model=0): 242, ProcessCoord(pipe=0, data=121, model=1): 243, ProcessCoord(pipe=0, data=122, model=0): 244, ProcessCoord(pipe=0, data=122, model=1): 245, ProcessCoord(pipe=0, data=123, model=0): 246, ProcessCoord(pipe=0, data=123, model=1): 247, ProcessCoord(pipe=0, data=124, model=0): 248, ProcessCoord(pipe=0, data=124, model=1): 249, ProcessCoord(pipe=0, data=125, model=0): 250, ProcessCoord(pipe=0, data=125, + 0: model=1): 251, ProcessCoord(pipe=0, data=126, model=0): 252, ProcessCoord(pipe=0, data=126, model=1): 253, ProcessCoord(pipe=0, data=127, model=0): 254, ProcessCoord(pipe=0, data=127, model=1): 255, ProcessCoord(pipe=1, data=0, model=0): 256, ProcessCoord(pipe=1, data=0, model=1): 257, ProcessCoord(pipe=1, data=1, model=0): 258, ProcessCoord(pipe=1, data=1, model=1): 259, ProcessCoord(pipe=1, data=2, model=0): 260, ProcessCoord(pipe=1, data=2, model=1): 261, ProcessCoord(pipe=1, data=3, model=0): 262, ProcessCoord(pipe=1, data=3, model=1): 263, ProcessCoord(pipe=1, data=4, model=0): 264, ProcessCoord(pipe=1, data=4, model=1): 265, ProcessCoord(pipe=1, data=5, model=0): 266, ProcessCoord(pipe=1, data=5, model=1): 267, ProcessCoord(pipe=1, data=6, model=0): 268, ProcessCoord(pipe=1, data=6, model=1): 269, ProcessCoord(pipe=1, data=7, model=0): 270, ProcessCoord(pipe=1, data=7, model=1): 271, ProcessCoord(pipe=1, data=8, model=0): 272, ProcessCoord(pipe=1, data=8, model=1): 273, ProcessCoord(pipe=1, data=9, mode + 0: l=0): 274, ProcessCoord(pipe=1, data=9, model=1): 275, ProcessCoord(pipe=1, data=10, model=0): 276, ProcessCoord(pipe=1, data=10, model=1): 277, ProcessCoord(pipe=1, data=11, model=0): 278, ProcessCoord(pipe=1, data=11, model=1): 279, ProcessCoord(pipe=1, data=12, model=0): 280, ProcessCoord(pipe=1, data=12, model=1): 281, ProcessCoord(pipe=1, data=13, model=0): 282, ProcessCoord(pipe=1, data=13, model=1): 283, ProcessCoord(pipe=1, data=14, model=0): 284, ProcessCoord(pipe=1, data=14, model=1): 285, ProcessCoord(pipe=1, data=15, model=0): 286, ProcessCoord(pipe=1, data=15, model=1): 287, ProcessCoord(pipe=1, data=16, model=0): 288, ProcessCoord(pipe=1, data=16, model=1): 289, ProcessCoord(pipe=1, data=17, model=0): 290, ProcessCoord(pipe=1, data=17, model=1): 291, ProcessCoord(pipe=1, data=18, model=0): 292, ProcessCoord(pipe=1, data=18, model=1): 293, ProcessCoord(pipe=1, data=19, model=0): 294, ProcessCoord(pipe=1, data=19, model=1): 295, ProcessCoord(pipe=1, data=20, model=0): 296, ProcessCoord(pipe=1, dat + 0: a=20, model=1): 297, ProcessCoord(pipe=1, data=21, model=0): 298, ProcessCoord(pipe=1, data=21, model=1): 299, ProcessCoord(pipe=1, data=22, model=0): 300, ProcessCoord(pipe=1, data=22, model=1): 301, ProcessCoord(pipe=1, data=23, model=0): 302, ProcessCoord(pipe=1, data=23, model=1): 303, ProcessCoord(pipe=1, data=24, model=0): 304, ProcessCoord(pipe=1, data=24, model=1): 305, ProcessCoord(pipe=1, data=25, model=0): 306, ProcessCoord(pipe=1, data=25, model=1): 307, ProcessCoord(pipe=1, data=26, model=0): 308, ProcessCoord(pipe=1, data=26, model=1): 309, ProcessCoord(pipe=1, data=27, model=0): 310, ProcessCoord(pipe=1, data=27, model=1): 311, ProcessCoord(pipe=1, data=28, model=0): 312, ProcessCoord(pipe=1, data=28, model=1): 313, ProcessCoord(pipe=1, data=29, model=0): 314, ProcessCoord(pipe=1, data=29, model=1): 315, ProcessCoord(pipe=1, data=30, model=0): 316, ProcessCoord(pipe=1, data=30, model=1): 317, ProcessCoord(pipe=1, data=31, model=0): 318, ProcessCoord(pipe=1, data=31, model=1): 319, ProcessCoord( + 0: pipe=1, data=32, model=0): 320, ProcessCoord(pipe=1, data=32, model=1): 321, ProcessCoord(pipe=1, data=33, model=0): 322, ProcessCoord(pipe=1, data=33, model=1): 323, ProcessCoord(pipe=1, data=34, model=0): 324, ProcessCoord(pipe=1, data=34, model=1): 325, ProcessCoord(pipe=1, data=35, model=0): 326, ProcessCoord(pipe=1, data=35, model=1): 327, ProcessCoord(pipe=1, data=36, model=0): 328, ProcessCoord(pipe=1, data=36, model=1): 329, ProcessCoord(pipe=1, data=37, model=0): 330, ProcessCoord(pipe=1, data=37, model=1): 331, ProcessCoord(pipe=1, data=38, model=0): 332, ProcessCoord(pipe=1, data=38, model=1): 333, ProcessCoord(pipe=1, data=39, model=0): 334, ProcessCoord(pipe=1, data=39, model=1): 335, ProcessCoord(pipe=1, data=40, model=0): 336, ProcessCoord(pipe=1, data=40, model=1): 337, ProcessCoord(pipe=1, data=41, model=0): 338, ProcessCoord(pipe=1, data=41, model=1): 339, ProcessCoord(pipe=1, data=42, model=0): 340, ProcessCoord(pipe=1, data=42, model=1): 341, ProcessCoord(pipe=1, data=43, model=0): 342, Pr + 0: ocessCoord(pipe=1, data=43, model=1): 343, ProcessCoord(pipe=1, data=44, model=0): 344, ProcessCoord(pipe=1, data=44, model=1): 345, ProcessCoord(pipe=1, data=45, model=0): 346, ProcessCoord(pipe=1, data=45, model=1): 347, ProcessCoord(pipe=1, data=46, model=0): 348, ProcessCoord(pipe=1, data=46, model=1): 349, ProcessCoord(pipe=1, data=47, model=0): 350, ProcessCoord(pipe=1, data=47, model=1): 351, ProcessCoord(pipe=1, data=48, model=0): 352, ProcessCoord(pipe=1, data=48, model=1): 353, ProcessCoord(pipe=1, data=49, model=0): 354, ProcessCoord(pipe=1, data=49, model=1): 355, ProcessCoord(pipe=1, data=50, model=0): 356, ProcessCoord(pipe=1, data=50, model=1): 357, ProcessCoord(pipe=1, data=51, model=0): 358, ProcessCoord(pipe=1, data=51, model=1): 359, ProcessCoord(pipe=1, data=52, model=0): 360, ProcessCoord(pipe=1, data=52, model=1): 361, ProcessCoord(pipe=1, data=53, model=0): 362, ProcessCoord(pipe=1, data=53, model=1): 363, ProcessCoord(pipe=1, data=54, model=0): 364, ProcessCoord(pipe=1, data=54, model= + 0: 1): 365, ProcessCoord(pipe=1, data=55, model=0): 366, ProcessCoord(pipe=1, data=55, model=1): 367, ProcessCoord(pipe=1, data=56, model=0): 368, ProcessCoord(pipe=1, data=56, model=1): 369, ProcessCoord(pipe=1, data=57, model=0): 370, ProcessCoord(pipe=1, data=57, model=1): 371, ProcessCoord(pipe=1, data=58, model=0): 372, ProcessCoord(pipe=1, data=58, model=1): 373, ProcessCoord(pipe=1, data=59, model=0): 374, ProcessCoord(pipe=1, data=59, model=1): 375, ProcessCoord(pipe=1, data=60, model=0): 376, ProcessCoord(pipe=1, data=60, model=1): 377, ProcessCoord(pipe=1, data=61, model=0): 378, ProcessCoord(pipe=1, data=61, model=1): 379, ProcessCoord(pipe=1, data=62, model=0): 380, ProcessCoord(pipe=1, data=62, model=1): 381, ProcessCoord(pipe=1, data=63, model=0): 382, ProcessCoord(pipe=1, data=63, model=1): 383, ProcessCoord(pipe=1, data=64, model=0): 384, ProcessCoord(pipe=1, data=64, model=1): 385, ProcessCoord(pipe=1, data=65, model=0): 386, ProcessCoord(pipe=1, data=65, model=1): 387, ProcessCoord(pipe=1, data + 0: =66, model=0): 388, ProcessCoord(pipe=1, data=66, model=1): 389, ProcessCoord(pipe=1, data=67, model=0): 390, ProcessCoord(pipe=1, data=67, model=1): 391, ProcessCoord(pipe=1, data=68, model=0): 392, ProcessCoord(pipe=1, data=68, model=1): 393, ProcessCoord(pipe=1, data=69, model=0): 394, ProcessCoord(pipe=1, data=69, model=1): 395, ProcessCoord(pipe=1, data=70, model=0): 396, ProcessCoord(pipe=1, data=70, model=1): 397, ProcessCoord(pipe=1, data=71, model=0): 398, ProcessCoord(pipe=1, data=71, model=1): 399, ProcessCoord(pipe=1, data=72, model=0): 400, ProcessCoord(pipe=1, data=72, model=1): 401, ProcessCoord(pipe=1, data=73, model=0): 402, ProcessCoord(pipe=1, data=73, model=1): 403, ProcessCoord(pipe=1, data=74, model=0): 404, ProcessCoord(pipe=1, data=74, model=1): 405, ProcessCoord(pipe=1, data=75, model=0): 406, ProcessCoord(pipe=1, data=75, model=1): 407, ProcessCoord(pipe=1, data=76, model=0): 408, ProcessCoord(pipe=1, data=76, model=1): 409, ProcessCoord(pipe=1, data=77, model=0): 410, ProcessCoord(p + 0: ipe=1, data=77, model=1): 411, ProcessCoord(pipe=1, data=78, model=0): 412, ProcessCoord(pipe=1, data=78, model=1): 413, ProcessCoord(pipe=1, data=79, model=0): 414, ProcessCoord(pipe=1, data=79, model=1): 415, ProcessCoord(pipe=1, data=80, model=0): 416, ProcessCoord(pipe=1, data=80, model=1): 417, ProcessCoord(pipe=1, data=81, model=0): 418, ProcessCoord(pipe=1, data=81, model=1): 419, ProcessCoord(pipe=1, data=82, model=0): 420, ProcessCoord(pipe=1, data=82, model=1): 421, ProcessCoord(pipe=1, data=83, model=0): 422, ProcessCoord(pipe=1, data=83, model=1): 423, ProcessCoord(pipe=1, data=84, model=0): 424, ProcessCoord(pipe=1, data=84, model=1): 425, ProcessCoord(pipe=1, data=85, model=0): 426, ProcessCoord(pipe=1, data=85, model=1): 427, ProcessCoord(pipe=1, data=86, model=0): 428, ProcessCoord(pipe=1, data=86, model=1): 429, ProcessCoord(pipe=1, data=87, model=0): 430, ProcessCoord(pipe=1, data=87, model=1): 431, ProcessCoord(pipe=1, data=88, model=0): 432, ProcessCoord(pipe=1, data=88, model=1): 433, Pro + 0: cessCoord(pipe=1, data=89, model=0): 434, ProcessCoord(pipe=1, data=89, model=1): 435, ProcessCoord(pipe=1, data=90, model=0): 436, ProcessCoord(pipe=1, data=90, model=1): 437, ProcessCoord(pipe=1, data=91, model=0): 438, ProcessCoord(pipe=1, data=91, model=1): 439, ProcessCoord(pipe=1, data=92, model=0): 440, ProcessCoord(pipe=1, data=92, model=1): 441, ProcessCoord(pipe=1, data=93, model=0): 442, ProcessCoord(pipe=1, data=93, model=1): 443, ProcessCoord(pipe=1, data=94, model=0): 444, ProcessCoord(pipe=1, data=94, model=1): 445, ProcessCoord(pipe=1, data=95, model=0): 446, ProcessCoord(pipe=1, data=95, model=1): 447, ProcessCoord(pipe=1, data=96, model=0): 448, ProcessCoord(pipe=1, data=96, model=1): 449, ProcessCoord(pipe=1, data=97, model=0): 450, ProcessCoord(pipe=1, data=97, model=1): 451, ProcessCoord(pipe=1, data=98, model=0): 452, ProcessCoord(pipe=1, data=98, model=1): 453, ProcessCoord(pipe=1, data=99, model=0): 454, ProcessCoord(pipe=1, data=99, model=1): 455, ProcessCoord(pipe=1, data=100, model= + 0: 0): 456, ProcessCoord(pipe=1, data=100, model=1): 457, ProcessCoord(pipe=1, data=101, model=0): 458, ProcessCoord(pipe=1, data=101, model=1): 459, ProcessCoord(pipe=1, data=102, model=0): 460, ProcessCoord(pipe=1, data=102, model=1): 461, ProcessCoord(pipe=1, data=103, model=0): 462, ProcessCoord(pipe=1, data=103, model=1): 463, ProcessCoord(pipe=1, data=104, model=0): 464, ProcessCoord(pipe=1, data=104, model=1): 465, ProcessCoord(pipe=1, data=105, model=0): 466, ProcessCoord(pipe=1, data=105, model=1): 467, ProcessCoord(pipe=1, data=106, model=0): 468, ProcessCoord(pipe=1, data=106, model=1): 469, ProcessCoord(pipe=1, data=107, model=0): 470, ProcessCoord(pipe=1, data=107, model=1): 471, ProcessCoord(pipe=1, data=108, model=0): 472, ProcessCoord(pipe=1, data=108, model=1): 473, ProcessCoord(pipe=1, data=109, model=0): 474, ProcessCoord(pipe=1, data=109, model=1): 475, ProcessCoord(pipe=1, data=110, model=0): 476, ProcessCoord(pipe=1, data=110, model=1): 477, ProcessCoord(pipe=1, data=111, model=0): 478, Pro + 0: cessCoord(pipe=1, data=111, model=1): 479, ProcessCoord(pipe=1, data=112, model=0): 480, ProcessCoord(pipe=1, data=112, model=1): 481, ProcessCoord(pipe=1, data=113, model=0): 482, ProcessCoord(pipe=1, data=113, model=1): 483, ProcessCoord(pipe=1, data=114, model=0): 484, ProcessCoord(pipe=1, data=114, model=1): 485, ProcessCoord(pipe=1, data=115, model=0): 486, ProcessCoord(pipe=1, data=115, model=1): 487, ProcessCoord(pipe=1, data=116, model=0): 488, ProcessCoord(pipe=1, data=116, model=1): 489, ProcessCoord(pipe=1, data=117, model=0): 490, ProcessCoord(pipe=1, data=117, model=1): 491, ProcessCoord(pipe=1, data=118, model=0): 492, ProcessCoord(pipe=1, data=118, model=1): 493, ProcessCoord(pipe=1, data=119, model=0): 494, ProcessCoord(pipe=1, data=119, model=1): 495, ProcessCoord(pipe=1, data=120, model=0): 496, ProcessCoord(pipe=1, data=120, model=1): 497, ProcessCoord(pipe=1, data=121, model=0): 498, ProcessCoord(pipe=1, data=121, model=1): 499, ProcessCoord(pipe=1, data=122, model=0): 500, ProcessCoord(pi + 0: pe=1, data=122, model=1): 501, ProcessCoord(pipe=1, data=123, model=0): 502, ProcessCoord(pipe=1, data=123, model=1): 503, ProcessCoord(pipe=1, data=124, model=0): 504, ProcessCoord(pipe=1, data=124, model=1): 505, ProcessCoord(pipe=1, data=125, model=0): 506, ProcessCoord(pipe=1, data=125, model=1): 507, ProcessCoord(pipe=1, data=126, model=0): 508, ProcessCoord(pipe=1, data=126, model=1): 509, ProcessCoord(pipe=1, data=127, model=0): 510, ProcessCoord(pipe=1, data=127, model=1): 511} + 0: [2022-12-01 18:26:44,751] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer + 0: stage=0 layers=24 + 0: 0: _to_float16 + 0: 1: EmbeddingPipe + 0: 2: + 0: 3: ParallelTransformerLayerPipe + 0: 4: ParallelTransformerLayerPipe + 0: 5: ParallelTransformerLayerPipe + 0: 6: ParallelTransformerLayerPipe + 0: 7: ParallelTransformerLayerPipe + 0: 8: ParallelTransformerLayerPipe + 0: 9: ParallelTransformerLayerPipe + 0: 10: ParallelTransformerLayerPipe + 0: 11: ParallelTransformerLayerPipe + 0: 12: ParallelTransformerLayerPipe + 0: 13: ParallelTransformerLayerPipe + 0: 14: ParallelTransformerLayerPipe + 0: 15: ParallelTransformerLayerPipe + 0: 16: ParallelTransformerLayerPipe + 0: 17: ParallelTransformerLayerPipe + 0: 18: ParallelTransformerLayerPipe + 0: 19: ParallelTransformerLayerPipe + 0: 20: ParallelTransformerLayerPipe + 0: 21: ParallelTransformerLayerPipe + 0: 22: ParallelTransformerLayerPipe + 0: 23: ParallelTransformerLayerPipe + 0: stage=1 layers=25 + 0: 24: ParallelTransformerLayerPipe + 0: 25: ParallelTransformerLayerPipe + 0: 26: ParallelTransformerLayerPipe + 0: 27: ParallelTransformerLayerPipe + 0: 28: ParallelTransformerLayerPipe + 0: 29: ParallelTransformerLayerPipe + 0: 30: ParallelTransformerLayerPipe + 0: 31: ParallelTransformerLayerPipe + 0: 32: ParallelTransformerLayerPipe + 0: 33: ParallelTransformerLayerPipe + 0: 34: ParallelTransformerLayerPipe + 0: 35: ParallelTransformerLayerPipe + 0: 36: ParallelTransformerLayerPipe + 0: 37: ParallelTransformerLayerPipe + 0: 38: ParallelTransformerLayerPipe + 0: 39: ParallelTransformerLayerPipe + 0: 40: ParallelTransformerLayerPipe + 0: 41: ParallelTransformerLayerPipe + 0: 42: ParallelTransformerLayerPipe + 0: 43: ParallelTransformerLayerPipe + 0: 44: ParallelTransformerLayerPipe + 0: 45: undo + 0: 46: MixedFusedLayerNorm + 0: 47: EmbeddingPipe + 0: 48: float16_to_fp32 + 0: loss: CrossEntropy + 0: [2022-12-01 18:26:50,892] [INFO] [utils.py:827:see_memory_usage] After Building Model + 0: [2022-12-01 18:26:50,892] [INFO] [utils.py:828:see_memory_usage] MA 4.16 GB Max_MA 4.16 GB CA 4.17 GB Max_CA 4 GB + 0: [2022-12-01 18:26:50,893] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.04 GB, percent = 6.0% + 0: setting training iterations to 5494 + 0: > learning rate decay style: cosine + 0: DeepSpeed is enabled. + 0: [2022-12-01 18:26:50,895] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown + 0: [2022-12-01 18:26:59,431] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False + 0: [2022-12-01 18:26:59,432] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer + 0: [2022-12-01 18:26:59,432] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer + 0: [2022-12-01 18:26:59,440] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam + 0: [2022-12-01 18:26:59,440] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer + 0: [2022-12-01 18:26:59,486] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer + 0: [2022-12-01 18:26:59,486] [INFO] [utils.py:828:see_memory_usage] MA 4.15 GB Max_MA 4.18 GB CA 4.18 GB Max_CA 4 GB + 0: [2022-12-01 18:26:59,486] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.9 GB, percent = 6.1% +26: ninja: no work to do. +26: Time to load utils op: 0.4061152935028076 seconds +26: Time to load utils op: 0.4353678226470947 seconds +17: Time to load utils op: 0.4074232578277588 seconds +17: Time to load utils op: 0.40746164321899414 seconds +18: Time to load utils op: 0.4065840244293213 seconds +18: Time to load utils op: 0.4066622257232666 seconds +17: Time to load utils op: 0.4074733257293701 seconds +36: ninja: no work to do. +36: Time to load utils op: 0.19267630577087402 seconds +34: Time to load utils op: 0.4169273376464844 seconds +35: Time to load utils op: 0.4173858165740967 seconds + 0: Time to load utils op: 0.3098280429840088 seconds + 0: Time to load utils op: 0.5108437538146973 secondsTime to load utils op: 0.5105702877044678 seconds + 0: Time to load utils op: 0.5107874870300293 seconds + 0: +55: Time to load utils op: 0.40985608100891113 seconds +59: Time to load utils op: 0.40899658203125 seconds +35: Time to load utils op: 0.20209097862243652 seconds + 2: Time to load utils op: 0.5115163326263428 seconds + 2: Time to load utils op: 0.5115461349487305 secondsTime to load utils op: 0.5115451812744141 seconds + 2: + 2: Time to load utils op: 0.5115530490875244 seconds + 4: Time to load utils op: 0.5107347965240479 secondsTime to load utils op: 0.5107355117797852 seconds + 4: + 4: Time to load utils op: 0.5107431411743164 secondsTime to load utils op: 0.5107426643371582 seconds + 4: + 1: Time to load utils op: 0.5121989250183105 secondsTime to load utils op: 0.5122015476226807 seconds + 1: + 1: Time to load utils op: 0.5122284889221191 seconds + 1: Time to load utils op: 0.512237548828125 seconds + 5: Time to load utils op: 0.5111496448516846 seconds + 5: Time to load utils op: 0.5111575126647949 seconds + 5: Time to load utils op: 0.5111563205718994 seconds + 5: Time to load utils op: 0.5111634731292725 seconds + 3: Time to load utils op: 0.5117971897125244 seconds + 3: Time to load utils op: 0.511817455291748 seconds + 3: Time to load utils op: 0.5118188858032227 seconds + 3: Time to load utils op: 0.5118274688720703 seconds + 7: Time to load utils op: 0.5109035968780518 secondsTime to load utils op: 0.5108966827392578 seconds + 7: + 7: Time to load utils op: 0.5109124183654785 seconds + 7: Time to load utils op: 0.510941743850708 seconds +36: Time to load utils op: 0.20364046096801758 seconds +17: Time to load utils op: 0.5058157444000244 seconds +18: Time to load utils op: 0.5056490898132324 seconds +18: Time to load utils op: 0.5058152675628662 seconds + 6: Time to load utils op: 0.5120677947998047 seconds + 8: Time to load utils op: 0.5114471912384033 secondsTime to load utils op: 0.511448860168457 seconds + 8: + 6: Time to load utils op: 0.5120823383331299 secondsTime to load utils op: 0.5120837688446045 seconds + 6: + 6: Time to load utils op: 0.5120961666107178 seconds + 8: Time to load utils op: 0.5114624500274658 secondsTime to load utils op: 0.5114579200744629 seconds + 8: +36: Time to load utils op: 0.20458197593688965 seconds +15: Time to load utils op: 0.5126969814300537 seconds +10: Time to load utils op: 0.5112183094024658 seconds +15: Time to load utils op: 0.5127384662628174 seconds +15: Time to load utils op: 0.5126557350158691 seconds +15: Time to load utils op: 0.5122737884521484 seconds +10: Time to load utils op: 0.5112559795379639 secondsTime to load utils op: 0.5112481117248535 seconds +10: +10: Time to load utils op: 0.5112662315368652 seconds +35: Time to load utils op: 0.2060708999633789 secondsTime to load utils op: 0.20590925216674805 seconds +35: +35: Time to load utils op: 0.20657658576965332 seconds + 9: Time to load utils op: 0.5121164321899414 seconds + 9: Time to load utils op: 0.5121359825134277 seconds + 9: Time to load utils op: 0.512143611907959 secondsTime to load utils op: 0.5121469497680664 seconds + 9: +34: Time to load utils op: 0.20704317092895508 seconds +32: Time to load utils op: 0.20858526229858398 seconds +32: Time to load utils op: 0.40981507301330566 seconds +32: Time to load utils op: 0.20812773704528809 seconds +32: Time to load utils op: 0.20845818519592285 seconds +34: Time to load utils op: 0.20682454109191895 seconds +33: Time to load utils op: 0.20805811882019043 seconds +33: Time to load utils op: 0.20775437355041504 seconds +33: Time to load utils op: 0.20819878578186035 seconds +33: Time to load utils op: 0.20717120170593262 seconds + 0: Time to load utils op: 0.30246949195861816 seconds +34: Time to load utils op: 0.20688223838806152 seconds +34: Time to load utils op: 0.20396161079406738 seconds +19: Time to load utils op: 0.5117721557617188 seconds +19: Time to load utils op: 0.5110478401184082 seconds +19: Time to load utils op: 0.5116691589355469 seconds +19: Time to load utils op: 0.5121729373931885 seconds +11: Time to load utils op: 0.5117900371551514 secondsTime to load utils op: 0.511791467666626 seconds +11: + 0: Time to load utils op: 0.30316853523254395 seconds +11: Time to load utils op: 0.5118036270141602 seconds +11: Time to load utils op: 0.5118100643157959 seconds + 1: Time to load utils op: 0.302570104598999 seconds + 1: Time to load utils op: 0.30350518226623535 secondsTime to load utils op: 0.30324840545654297 seconds + 1: + 2: Time to load utils op: 0.3024933338165283 seconds + 3: Time to load utils op: 0.30257630348205566 seconds + 2: Time to load utils op: 0.3028371334075928 seconds + 3: Time to load utils op: 0.30299973487854004 seconds + 2: Time to load utils op: 0.3032212257385254 secondsTime to load utils op: 0.3032059669494629 seconds + 2: + 3: Time to load utils op: 0.30333375930786133 secondsTime to load utils op: 0.30333399772644043 seconds + 3: + 4: Time to load utils op: 0.3031179904937744 secondsTime to load utils op: 0.3030354976654053 seconds + 4: + 5: Time to load utils op: 0.30298638343811035 seconds + 6: Time to load utils op: 0.30268168449401855 seconds + 7: Time to load utils op: 0.30239439010620117 seconds + 5: Time to load utils op: 0.30304503440856934 seconds + 6: Time to load utils op: 0.30266880989074707 seconds + 4: Time to load utils op: 0.30332064628601074 seconds + 5: Time to load utils op: 0.3032560348510742 seconds + 7: Time to load utils op: 0.3027055263519287 seconds + 7: Time to load utils op: 0.30258941650390625 seconds + 6: Time to load utils op: 0.302776575088501 seconds + 9: Time to load utils op: 0.30317211151123047 seconds + 8: Time to load utils op: 0.3033561706542969 seconds + 9: Time to load utils op: 0.30322790145874023 seconds + 8: Time to load utils op: 0.3032228946685791 seconds + 8: Time to load utils op: 0.3032956123352051 seconds +10: Time to load utils op: 0.30240869522094727 seconds + 9: Time to load utils op: 0.30359816551208496 seconds +55: Time to load utils op: 0.20299911499023438 seconds + 8: Time to load utils op: 0.30361270904541016 seconds +10: Time to load utils op: 0.3028557300567627 seconds +55: Time to load utils op: 0.2032628059387207 seconds +10: Time to load utils op: 0.30325889587402344 seconds +55: Time to load utils op: 0.20315837860107422 seconds +55: Time to load utils op: 0.20334744453430176 seconds +10: Time to load utils op: 0.30326342582702637 seconds +11: Time to load utils op: 0.3026740550994873 seconds +11: Time to load utils op: 0.3022744655609131 seconds +11: Time to load utils op: 0.30237531661987305 seconds +11: Time to load utils op: 0.30277132987976074 seconds +59: Time to load utils op: 0.20257282257080078 seconds +59: Time to load utils op: 0.20219898223876953 seconds +15: Time to load utils op: 0.3027205467224121 seconds +59: Time to load utils op: 0.20339560508728027 seconds +15: Time to load utils op: 0.30310726165771484 seconds +59: Time to load utils op: 0.2024519443511963 seconds +15: Time to load utils op: 0.3032083511352539 seconds +15: Time to load utils op: 0.3030273914337158 seconds +26: Time to load utils op: 0.3029344081878662 seconds +26: Time to load utils op: 0.3030095100402832 seconds + 0: Time to load utils op: 0.5027129650115967 seconds +17: Time to load utils op: 0.3028888702392578 seconds +19: Time to load utils op: 0.3025491237640381 seconds +17: Time to load utils op: 0.3029513359069824 seconds +18: Time to load utils op: 0.30294036865234375 seconds +17: Time to load utils op: 0.3031284809112549 seconds +18: Time to load utils op: 0.3029148578643799 seconds +18: Time to load utils op: 0.3030586242675781 seconds +19: Time to load utils op: 0.30324387550354004 seconds +19: Time to load utils op: 0.3031766414642334 seconds +19: Time to load utils op: 0.3036465644836426 seconds +26: Time to load utils op: 0.30237603187561035 seconds +26: Time to load utils op: 0.30225658416748047 seconds +26: Time to load utils op: 0.3023834228515625 seconds +26: Time to load utils op: 0.30240464210510254 seconds +12: Time to load utils op: 0.5318889617919922 secondsTime to load utils op: 0.31612443923950195 secondsTime to load utils op: 0.5319018363952637 seconds +12: +12: +12: Time to load utils op: 0.31626391410827637 seconds +12: Time to load utils op: 0.31611084938049316 seconds +12: Time to load utils op: 0.5319156646728516 secondsTime to load utils op: 0.5319302082061768 seconds +12: +32: Time to load utils op: 0.4029223918914795 seconds +32: Time to load utils op: 0.4030935764312744 seconds +32: Time to load utils op: 0.4035964012145996 seconds +34: Time to load utils op: 0.4026219844818115 seconds +35: Time to load utils op: 0.40264153480529785 seconds +34: Time to load utils op: 0.4027280807495117 seconds +35: Time to load utils op: 0.4026782512664795 seconds +34: Time to load utils op: 0.40287351608276367 seconds +35: Time to load utils op: 0.4030771255493164 seconds +33: Time to load utils op: 0.40462207794189453 seconds +33: Time to load utils op: 0.4044513702392578 seconds +33: Time to load utils op: 0.4048349857330322 seconds +33: Time to load utils op: 0.4050459861755371 seconds +36: Time to load utils op: 0.4051802158355713 seconds +36: Time to load utils op: 0.40523767471313477 seconds +36: Time to load utils op: 0.4053013324737549 seconds +36: Time to load utils op: 0.40540313720703125 seconds +55: Time to load utils op: 0.4028754234313965 seconds +55: Time to load utils op: 0.40293192863464355 seconds +55: Time to load utils op: 0.40294671058654785 seconds +59: Time to load utils op: 0.4028007984161377 seconds +36: Time to load utils op: 0.3029630184173584 seconds +59: Time to load utils op: 0.40323424339294434 seconds +59: Time to load utils op: 0.40325474739074707 seconds + 0: Time to load utils op: 0.4026925563812256 seconds + 1: Time to load utils op: 0.4028594493865967 seconds + 4: Time to load utils op: 0.402878999710083 seconds + 5: Time to load utils op: 0.40290069580078125 seconds + 6: Time to load utils op: 0.4032449722290039 seconds + 7: Time to load utils op: 0.4029381275177002 seconds + 0: [2022-12-01 18:27:00,027] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 + 0: [2022-12-01 18:27:00,028] [INFO] [utils.py:828:see_memory_usage] MA 4.15 GB Max_MA 4.15 GB CA 4.18 GB Max_CA 4 GB + 0: [2022-12-01 18:27:00,028] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.9 GB, percent = 6.1% + 9: Time to load utils op: 0.40301513671875 seconds +12: Time to load utils op: 0.40254998207092285 seconds +17: Time to load utils op: 0.4025084972381592 seconds +18: Time to load utils op: 0.40265774726867676 seconds +14: Time to load utils op: 0.3486514091491699 seconds +14: Time to load utils op: 0.5639142990112305 seconds +14: Time to load utils op: 0.3487551212310791 seconds +14: Time to load utils op: 0.5639770030975342 secondsTime to load utils op: 0.5639688968658447 seconds +14: +14: Time to load utils op: 0.3487253189086914 seconds +14: Time to load utils op: 0.407550573348999 seconds +14: Time to load utils op: 0.5639901161193848 seconds +32: Time to load utils op: 0.30214905738830566 seconds +46: Time to load utils op: 0.30794405937194824 seconds +46: Time to load utils op: 0.30797338485717773 seconds +46: Time to load utils op: 0.30792760848999023 seconds +46: Time to load utils op: 0.4749164581298828 secondsTime to load utils op: 0.47482943534851074 seconds +46: +46: Time to load utils op: 0.4749312400817871 seconds +46: Time to load utils op: 0.4749288558959961 secondsTime to load utils op: 0.30816650390625 seconds +46: +38: Time to load utils op: 0.37052106857299805 secondsTime to load utils op: 0.31333303451538086 secondsTime to load utils op: 0.31339526176452637 seconds +38: +38: +38: Time to load utils op: 0.4797544479370117 seconds +16: Time to load utils op: 0.40169286727905273 secondsTime to load utils op: 0.40187621116638184 seconds +16: +38: Time to load utils op: 0.4797635078430176 secondsTime to load utils op: 0.31333065032958984 secondsTime to load utils op: 0.4797680377960205 seconds +38: +38: +38: Time to load utils op: 0.47977709770202637 seconds +16: Time to load utils op: 0.40167880058288574 seconds +16: Time to load utils op: 0.6173946857452393 seconds +16: Time to load utils op: 0.6174030303955078 secondsTime to load utils op: 0.617405891418457 seconds +16: +16: Time to load utils op: 0.6174061298370361 seconds +16: Time to load utils op: 0.4604775905609131 seconds +54: Time to load utils op: 0.3194901943206787 seconds +54: Time to load utils op: 0.31978416442871094 seconds +54: Time to load utils op: 0.48526954650878906 seconds +54: Time to load utils op: 0.48528218269348145 seconds +54: Time to load utils op: 0.4856076240539551 seconds +54: Time to load utils op: 0.3200669288635254 seconds +54: Time to load utils op: 0.4856455326080322 secondsTime to load utils op: 0.32007646560668945 seconds +54: +42: Time to load utils op: 0.33477187156677246 secondsTime to load utils op: 0.33489990234375 seconds +42: +42: Time to load utils op: 0.3911900520324707 seconds +42: Time to load utils op: 0.5012214183807373 seconds +42: Time to load utils op: 0.5017037391662598 secondsTime to load utils op: 0.33547282218933105 seconds +42: Time to load utils op: 0.5017197132110596 secondsTime to load utils op: 0.501716136932373 seconds +42: +42: +50: Time to load utils op: 0.34752702713012695 seconds +50: Time to load utils op: 0.34732842445373535 seconds +50: Time to load utils op: 0.4040706157684326 seconds +50: Time to load utils op: 0.5135307312011719 seconds +50: Time to load utils op: 0.5144646167755127 seconds +50: Time to load utils op: 0.5144758224487305 seconds +50: Time to load utils op: 0.5144989490509033 seconds +50: Time to load utils op: 0.34853529930114746 seconds +48: Time to load utils op: 0.5230996608734131 seconds +48: Time to load utils op: 0.5231211185455322 seconds +48: Time to load utils op: 0.3563268184661865 seconds +48: Time to load utils op: 0.4142415523529053 seconds +48: Time to load utils op: 0.35668015480041504 seconds +48: Time to load utils op: 0.35686254501342773 seconds +48: Time to load utils op: 0.5235443115234375 secondsTime to load utils op: 0.5235397815704346 seconds +48: +53: Time to load utils op: 0.5205364227294922 seconds +53: Time to load utils op: 0.5205416679382324 seconds +53: Time to load utils op: 0.4111950397491455 secondsTime to load utils op: 0.35495758056640625 seconds +53: +53: Time to load utils op: 0.35497117042541504 seconds +53: Time to load utils op: 0.35494446754455566 secondsTime to load utils op: 0.5205676555633545 seconds +53: +53: Time to load utils op: 0.520580530166626 seconds +30: Time to load utils op: 0.44853878021240234 seconds +30: Time to load utils op: 0.44857239723205566 seconds +30: Time to load utils op: 0.44861674308776855 seconds +30: Time to load utils op: 0.44420337677001953 seconds +30: Time to load utils op: 0.4486548900604248 seconds +30: Time to load utils op: 0.44390177726745605 seconds +30: Time to load utils op: 0.44458794593811035 seconds +30: Time to load utils op: 0.49866795539855957 seconds +25: Time to load utils op: 0.6627976894378662 seconds +25: Time to load utils op: 0.6628119945526123 seconds +25: Time to load utils op: 0.4468402862548828 seconds +25: Time to load utils op: 0.44709181785583496 secondsTime to load utils op: 0.446852445602417 seconds +25: +25: Time to load utils op: 0.6628880500793457 seconds +25: Time to load utils op: 0.5062129497528076 seconds +25: Time to load utils op: 0.6629428863525391 seconds +44: Time to load utils op: 0.36159801483154297 seconds +44: Time to load utils op: 0.3616178035736084 seconds +44: Time to load utils op: 0.528231143951416 seconds +44: Time to load utils op: 0.4180734157562256 seconds +44: Time to load utils op: 0.5286977291107178 seconds +44: Time to load utils op: 0.528702974319458 secondsTime to load utils op: 0.5286900997161865 seconds +44: +44: Time to load utils op: 0.36214423179626465 seconds +43: Time to load utils op: 0.5294363498687744 seconds +43: Time to load utils op: 0.3633730411529541 seconds +43: Time to load utils op: 0.3637051582336426 secondsTime to load utils op: 0.5297257900238037 seconds +43: +43: Time to load utils op: 0.5297420024871826 seconds +43: Time to load utils op: 0.36372947692871094 seconds +43: Time to load utils op: 0.4204442501068115 secondsTime to load utils op: 0.5299355983734131 seconds +43: +57: Time to load utils op: 0.3563539981842041 seconds +57: Time to load utils op: 0.5224778652191162 seconds +57: Time to load utils op: 0.5224602222442627 seconds +57: Time to load utils op: 0.41393041610717773 seconds +57: Time to load utils op: 0.522965669631958 secondsTime to load utils op: 0.35683178901672363 seconds +57: +57: Time to load utils op: 0.35687804222106934 seconds +57: Time to load utils op: 0.5232667922973633 seconds +20: Time to load utils op: 0.45284080505371094 seconds +20: Time to load utils op: 0.669419527053833 seconds +51: Time to load utils op: 0.3605077266693115 secondsTime to load utils op: 0.36051464080810547 seconds +51: +20: Time to load utils op: 0.6694316864013672 seconds +20: Time to load utils op: 0.45313310623168945 seconds +20: Time to load utils op: 0.45337581634521484 secondsTime to load utils op: 0.6694765090942383 seconds +20: +20: Time to load utils op: 0.4532613754272461 seconds +20: Time to load utils op: 0.6695153713226318 seconds +51: Time to load utils op: 0.4172401428222656 seconds +51: Time to load utils op: 0.5276203155517578 seconds +51: Time to load utils op: 0.5276541709899902 seconds +51: Time to load utils op: 0.3610069751739502 seconds +51: Time to load utils op: 0.5276954174041748 seconds +51: Time to load utils op: 0.527428150177002 seconds +23: Time to load utils op: 0.6690843105316162 secondsTime to load utils op: 0.6690797805786133 seconds +23: +52: Time to load utils op: 0.3611917495727539 seconds +52: Time to load utils op: 0.3618490695953369 seconds +52: Time to load utils op: 0.36156249046325684 secondsTime to load utils op: 0.5275740623474121 seconds +52: +23: Time to load utils op: 0.4534773826599121 seconds +23: Time to load utils op: 0.6691584587097168 seconds +23: Time to load utils op: 0.4534173011779785 secondsTime to load utils op: 0.669170618057251 secondsTime to load utils op: 0.4531893730163574 seconds +23: +23: +23: Time to load utils op: 0.4532008171081543 seconds +52: Time to load utils op: 0.36185550689697266 secondsTime to load utils op: 0.5280859470367432 secondsTime to load utils op: 0.5261743068695068 seconds +52: +52: Time to load utils op: 0.5280745029449463 seconds +52: +49: Time to load utils op: 0.5343875885009766 seconds +49: Time to load utils op: 0.5344622135162354 seconds +49: Time to load utils op: 0.533461332321167 seconds +49: Time to load utils op: 0.5342497825622559 seconds +49: Time to load utils op: 0.364574670791626 seconds +49: Time to load utils op: 0.36444807052612305 seconds +49: Time to load utils op: 0.36400794982910156 seconds +49: Time to load utils op: 0.421734094619751 seconds +21: Time to load utils op: 0.4550204277038574 seconds +21: Time to load utils op: 0.6711523532867432 seconds +21: Time to load utils op: 0.6711666584014893 seconds +21: Time to load utils op: 0.4552795886993408 seconds +21: Time to load utils op: 0.6712057590484619 seconds +21: Time to load utils op: 0.6712100505828857 seconds +21: Time to load utils op: 0.45526719093322754 secondsTime to load utils op: 0.4549751281738281 seconds +21: +45: Time to load utils op: 0.5318267345428467 seconds +45: Time to load utils op: 0.42455172538757324 seconds +45: Time to load utils op: 0.36714649200439453 seconds +45: Time to load utils op: 0.5331859588623047 seconds +45: Time to load utils op: 0.5332000255584717 seconds +45: Time to load utils op: 0.36748480796813965 seconds +45: Time to load utils op: 0.36763429641723633 seconds +45: Time to load utils op: 0.5332298278808594 seconds +41: Time to load utils op: 0.5351376533508301 seconds +41: Time to load utils op: 0.36921167373657227 seconds +41: Time to load utils op: 0.5351662635803223 secondsTime to load utils op: 0.3692502975463867 seconds +41: +41: Time to load utils op: 0.5356383323669434 seconds +41: Time to load utils op: 0.36975812911987305 seconds +41: Time to load utils op: 0.42629098892211914 seconds +41: Time to load utils op: 0.5356709957122803 seconds +58: Time to load utils op: 0.5258786678314209 seconds +58: Time to load utils op: 0.5259027481079102 seconds +58: Time to load utils op: 0.35974860191345215 seconds +58: Time to load utils op: 0.5259919166564941 secondsTime to load utils op: 0.525998592376709 seconds +58: +28: Time to load utils op: 0.4531841278076172 secondsTime to load utils op: 0.45308876037597656 seconds +28: +58: Time to load utils op: 0.36041855812072754 secondsTime to load utils op: 0.36019158363342285 seconds +58: +58: Time to load utils op: 0.3602144718170166 seconds +28: Time to load utils op: 0.4551520347595215 seconds +28: Time to load utils op: 0.4531543254852295 seconds +28: Time to load utils op: 0.4551870822906494 secondsTime to load utils op: 0.45313000679016113 seconds +28: +28: Time to load utils op: 0.45519042015075684 seconds +28: Time to load utils op: 0.45524024963378906 seconds +13: Time to load utils op: 0.6773450374603271 seconds +13: Time to load utils op: 0.6773629188537598 seconds +27: Time to load utils op: 0.45551156997680664 seconds +13: Time to load utils op: 0.4622843265533447 secondsTime to load utils op: 0.4620070457458496 seconds +13: +27: Time to load utils op: 0.45485734939575195 seconds +13: Time to load utils op: 0.46207714080810547 seconds +27: Time to load utils op: 0.45978236198425293 seconds +13: Time to load utils op: 0.6774313449859619 seconds +27: Time to load utils op: 0.459714412689209 seconds +13: Time to load utils op: 0.6774375438690186 secondsTime to load utils op: 0.5202085971832275 seconds +13: +27: Time to load utils op: 0.45514988899230957 secondsTime to load utils op: 0.45479869842529297 seconds +27: +27: Time to load utils op: 0.7036011219024658 seconds +29: Time to load utils op: 0.4587850570678711 seconds +27: Time to load utils op: 0.45975708961486816 seconds +47: Time to load utils op: 0.5380399227142334 secondsTime to load utils op: 0.3677966594696045 seconds +47: +47: Time to load utils op: 0.42544984817504883 seconds +47: Time to load utils op: 0.5389485359191895 seconds +47: Time to load utils op: 0.5387074947357178 seconds +29: Time to load utils op: 0.4588308334350586 seconds +29: Time to load utils op: 0.45422840118408203 seconds +29: Time to load utils op: 0.4542710781097412 seconds +29: Time to load utils op: 0.45893287658691406 seconds +29: Time to load utils op: 0.4588751792907715 seconds +29: Time to load utils op: 0.4541928768157959 seconds +29: Time to load utils op: 0.5092830657958984 seconds +47: Time to load utils op: 0.36893749237060547 seconds +47: Time to load utils op: 0.36894679069519043 seconds +47: Time to load utils op: 0.5382156372070312 seconds +56: Time to load utils op: 0.5286083221435547 seconds +56: Time to load utils op: 0.36307787895202637 seconds +56: Time to load utils op: 0.3630232810974121 seconds +56: Time to load utils op: 0.36362433433532715 seconds +56: Time to load utils op: 0.5291032791137695 seconds +56: Time to load utils op: 0.3636476993560791 seconds +56: Time to load utils op: 0.5291204452514648 seconds +56: Time to load utils op: 0.5291481018066406 seconds +61: Time to load utils op: 0.5271048545837402 seconds +61: Time to load utils op: 0.36043381690979004 seconds +61: Time to load utils op: 0.5271477699279785 secondsTime to load utils op: 0.36064910888671875 seconds +61: +22: Time to load utils op: 0.4583616256713867 secondsTime to load utils op: 0.4584205150604248 seconds +22: +61: Time to load utils op: 0.5276260375976562 seconds +61: Time to load utils op: 0.5276389122009277 seconds +61: Time to load utils op: 0.3609747886657715 secondsTime to load utils op: 0.3611025810241699 seconds +61: +22: Time to load utils op: 0.4584026336669922 seconds +22: Time to load utils op: 0.6743202209472656 seconds +22: Time to load utils op: 0.6743302345275879 seconds +22: Time to load utils op: 0.5170774459838867 secondsTime to load utils op: 0.674351692199707 seconds +22: +22: Time to load utils op: 0.6743626594543457 seconds +60: Time to load utils op: 0.3620340824127197 seconds +60: Time to load utils op: 0.41884422302246094 seconds +60: Time to load utils op: 0.5280470848083496 secondsTime to load utils op: 0.3617687225341797 seconds +60: +60: Time to load utils op: 0.3622419834136963 seconds +60: Time to load utils op: 0.5285050868988037 secondsTime to load utils op: 0.5284891128540039 seconds +60: +60: Time to load utils op: 0.5285141468048096 seconds +37: Time to load utils op: 0.3754758834838867 secondsTime to load utils op: 0.3759801387786865 seconds +37: +63: Time to load utils op: 0.3614377975463867 seconds +63: Time to load utils op: 0.5313310623168945 seconds +63: Time to load utils op: 0.4188687801361084 seconds +63: Time to load utils op: 0.5312671661376953 seconds +37: Time to load utils op: 0.5462753772735596 seconds +37: Time to load utils op: 0.4329094886779785 secondsTime to load utils op: 0.5458829402923584 seconds +37: +37: Time to load utils op: 0.5463159084320068 secondsTime to load utils op: 0.3759458065032959 seconds +37: +37: Time to load utils op: 0.5459194183349609 seconds +40: Time to load utils op: 0.540229320526123 seconds +63: Time to load utils op: 0.3620872497558594 seconds +63: Time to load utils op: 0.5315089225769043 secondsTime to load utils op: 0.3613605499267578 seconds +63: +40: Time to load utils op: 0.5402262210845947 seconds +40: Time to load utils op: 0.3736560344696045 seconds +63: Time to load utils op: 0.5322322845458984 seconds +40: Time to load utils op: 0.43082189559936523 seconds +40: Time to load utils op: 0.37375378608703613 seconds +40: Time to load utils op: 0.5403323173522949 seconds +40: Time to load utils op: 0.3736274242401123 seconds +40: Time to load utils op: 0.5403516292572021 seconds +62: Time to load utils op: 0.5283043384552002 secondsTime to load utils op: 0.5283091068267822 seconds +62: +62: Time to load utils op: 0.3617732524871826 seconds +24: Time to load utils op: 0.6750171184539795 seconds +62: Time to load utils op: 0.3621547222137451 secondsTime to load utils op: 0.36238598823547363 seconds +62: +62: Time to load utils op: 0.3625967502593994 seconds +62: Time to load utils op: 0.5287399291992188 seconds +62: Time to load utils op: 0.5287387371063232 seconds +24: Time to load utils op: 0.6750218868255615 secondsTime to load utils op: 0.6750245094299316 seconds +24: +31: Time to load utils op: 0.5105757713317871 seconds +31: Time to load utils op: 0.46075940132141113 seconds +24: Time to load utils op: 0.6750340461730957 seconds +24: Time to load utils op: 0.45868706703186035 seconds +24: Time to load utils op: 0.45908093452453613 seconds +39: Time to load utils op: 0.3747086524963379 secondsTime to load utils op: 0.3746936321258545 seconds +39: +39: Time to load utils op: 0.3748641014099121 seconds +24: Time to load utils op: 0.4589710235595703 seconds +31: Time to load utils op: 0.4564511775970459 seconds +24: Time to load utils op: 0.4588742256164551 seconds +31: Time to load utils op: 0.45633935928344727 seconds +31: Time to load utils op: 0.4606590270996094 seconds +31: Time to load utils op: 0.4607253074645996 secondsTime to load utils op: 0.4565155506134033 seconds +31: +31: Time to load utils op: 0.4606504440307617 seconds +39: Time to load utils op: 0.5414919853210449 seconds +39: Time to load utils op: 0.5414884090423584 secondsTime to load utils op: 0.5415277481079102 seconds +39: +39: Time to load utils op: 0.5414924621582031 secondsTime to load utils op: 0.4319953918457031 seconds +39: + 0: Time to load utils op: 0.0005192756652832031 seconds + 0: Time to load utils op: 0.0005774497985839844 seconds + 0: Time to load utils op: 0.00045037269592285156 seconds + 0: Time to load utils op: 0.0005903244018554688 secondsTime to load utils op: 0.0006084442138671875 seconds + 0: Time to load utils op: 0.0005986690521240234 seconds + 0: + 0: Time to load utils op: 0.0006880760192871094 seconds +19: Time to load utils op: 0.0005080699920654297 seconds +19: Time to load utils op: 0.00042724609375 seconds +19: Time to load utils op: 0.0005180835723876953 seconds +19: Time to load utils op: 0.0004260540008544922 secondsTime to load utils op: 0.00041365623474121094 seconds +19: Time to load utils op: 0.00039839744567871094 seconds +19: +19: Time to load utils op: 0.0005323886871337891 secondsTime to load utils op: 0.0005428791046142578 seconds +19: +50: Time to load utils op: 0.0005981922149658203 seconds +50: Time to load utils op: 0.00038504600524902344 seconds +50: Time to load utils op: 0.0003898143768310547 seconds +50: Time to load utils op: 0.0004982948303222656 seconds +50: Time to load utils op: 0.0005414485931396484 seconds +50: Time to load utils op: 0.0004000663757324219 seconds +50: Time to load utils op: 0.000530242919921875 seconds +50: Time to load utils op: 0.00040721893310546875 seconds +44: Time to load utils op: 0.0005574226379394531 seconds +44: Time to load utils op: 0.0005536079406738281 seconds +44: Time to load utils op: 0.0005235671997070312 seconds +44: Time to load utils op: 0.0005640983581542969 seconds +44: Time to load utils op: 0.000579833984375 secondsTime to load utils op: 0.0005908012390136719 seconds +44: +44: Time to load utils op: 0.0006420612335205078 seconds +44: Time to load utils op: 0.0006337165832519531 seconds +15: Time to load utils op: 0.0009377002716064453 seconds +15: Time to load utils op: 0.0009126663208007812 seconds + 3: Time to load utils op: 0.0013165473937988281 seconds +32: Time to load utils op: 0.000545501708984375 seconds +32: Time to load utils op: 0.0004718303680419922 secondsTime to load utils op: 0.0004646778106689453 secondsTime to load utils op: 0.000469207763671875 secondsTime to load utils op: 0.00048661231994628906 secondsTime to load utils op: 0.0005004405975341797 secondsTime to load utils op: 0.0004832744598388672 seconds +32: +32: +32: +32: +32: +32: Time to load utils op: 0.00044465065002441406 seconds +15: Time to load utils op: 0.0013904571533203125 seconds +15: Time to load utils op: 0.0013527870178222656 seconds +15: Time to load utils op: 0.0014107227325439453 seconds +15: Time to load utils op: 0.0014095306396484375 secondsTime to load utils op: 0.0012633800506591797 seconds +15: +15: Time to load utils op: 0.001384735107421875 seconds + 3: Time to load utils op: 0.0016970634460449219 seconds + 3: Time to load utils op: 0.0016765594482421875 seconds + 3: Time to load utils op: 0.0017180442810058594 seconds + 3: Time to load utils op: 0.0017256736755371094 seconds + 3: Time to load utils op: 0.0016951560974121094 seconds + 3: Time to load utils op: 0.0016291141510009766 seconds +14: Time to load utils op: 0.0005052089691162109 seconds + 3: Time to load utils op: 0.0016846656799316406 seconds +14: Time to load utils op: 0.00042891502380371094 seconds +14: Time to load utils op: 0.0004191398620605469 seconds +43: Time to load utils op: 0.0005142688751220703 secondsTime to load utils op: 0.0005085468292236328 seconds +43: +43: Time to load utils op: 0.0005314350128173828 seconds +14: Time to load utils op: 0.0005195140838623047 seconds +14: Time to load utils op: 0.00041413307189941406 seconds +14: Time to load utils op: 0.0004222393035888672 seconds +14: Time to load utils op: 0.0004532337188720703 seconds +43: Time to load utils op: 0.0005741119384765625 seconds +14: Time to load utils op: 0.0005025863647460938 seconds +43: Time to load utils op: 0.0005624294281005859 seconds + 9: Time to load utils op: 0.0005002021789550781 seconds +17: Time to load utils op: 0.0005013942718505859 seconds + 9: Time to load utils op: 0.0004649162292480469 seconds +43: Time to load utils op: 0.0006244182586669922 seconds + 9: Time to load utils op: 0.00045943260192871094 seconds +17: Time to load utils op: 0.0004911422729492188 seconds +43: Time to load utils op: 0.0006649494171142578 secondsTime to load utils op: 0.0006415843963623047 seconds +43: + 9: Time to load utils op: 0.0004470348358154297 seconds + 9: Time to load utils op: 0.0004775524139404297 seconds + 9: Time to load utils op: 0.0004534721374511719 seconds +23: Time to load utils op: 0.0010704994201660156 seconds +17: Time to load utils op: 0.00042557716369628906 seconds + 9: Time to load utils op: 0.0004398822784423828 seconds +17: Time to load utils op: 0.0004425048828125 secondsTime to load utils op: 0.0004642009735107422 seconds +17: +17: Time to load utils op: 0.00041675567626953125 seconds + 9: Time to load utils op: 0.0004699230194091797 seconds +17: Time to load utils op: 0.0005233287811279297 seconds + 6: Time to load utils op: 0.0006444454193115234 secondsTime to load utils op: 0.0006062984466552734 seconds + 6: +17: Time to load utils op: 0.0005414485931396484 seconds + 6: Time to load utils op: 0.0006685256958007812 seconds + 6: Time to load utils op: 0.0006613731384277344 seconds + 6: Time to load utils op: 0.0006577968597412109 secondsTime to load utils op: 0.0006361007690429688 seconds + 6: + 6: Time to load utils op: 0.0006725788116455078 seconds + 6: Time to load utils op: 0.0007216930389404297 seconds +23: Time to load utils op: 0.0012843608856201172 seconds +23: Time to load utils op: 0.0012700557708740234 secondsTime to load utils op: 0.0012218952178955078 seconds +23: + 0: [2022-12-01 18:27:00,282] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 + 0: [2022-12-01 18:27:00,282] [INFO] [utils.py:828:see_memory_usage] MA 8.44 GB Max_MA 8.44 GB CA 10.57 GB Max_CA 11 GB + 0: [2022-12-01 18:27:00,282] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.93 GB, percent = 6.1% +48: Time to load utils op: 0.0005092620849609375 seconds +48: Time to load utils op: 0.0005171298980712891 seconds +48: Time to load utils op: 0.0005295276641845703 seconds +33: Time to load utils op: 0.0004906654357910156 seconds +48: Time to load utils op: 0.0005781650543212891 seconds +48: Time to load utils op: 0.0005962848663330078 seconds +33: Time to load utils op: 0.0005018711090087891 seconds +48: Time to load utils op: 0.0006167888641357422 seconds +33: Time to load utils op: 0.00046944618225097656 secondsTime to load utils op: 0.0004684925079345703 secondsTime to load utils op: 0.0004792213439941406 secondsTime to load utils op: 0.00046706199645996094 seconds +33: +33: Time to load utils op: 0.0004923343658447266 seconds +33: +33: +48: Time to load utils op: 0.0005834102630615234 seconds +49: Time to load utils op: 0.0005223751068115234 seconds +33: Time to load utils op: 0.0005474090576171875 seconds +48: Time to load utils op: 0.0006532669067382812 seconds +49: Time to load utils op: 0.0005595684051513672 seconds +49: Time to load utils op: 0.0005562305450439453 secondsTime to load utils op: 0.0005397796630859375 seconds +49: +49: Time to load utils op: 0.00044655799865722656 seconds +49: Time to load utils op: 0.0005691051483154297 seconds +49: Time to load utils op: 0.0005788803100585938 seconds +49: Time to load utils op: 0.0005538463592529297 seconds +37: Time to load utils op: 0.0005483627319335938 seconds +37: Time to load utils op: 0.0004911422729492188 seconds +37: Time to load utils op: 0.0004215240478515625 seconds +37: Time to load utils op: 0.0005807876586914062 seconds +37: Time to load utils op: 0.0005736351013183594 secondsTime to load utils op: 0.0005846023559570312 seconds +37: +37: Time to load utils op: 0.0007653236389160156 seconds +37: Time to load utils op: 0.0006792545318603516 seconds +63: Time to load utils op: 0.0005528926849365234 seconds +63: Time to load utils op: 0.0005698204040527344 seconds +63: Time to load utils op: 0.0005729198455810547 seconds +63: Time to load utils op: 0.0006330013275146484 seconds +63: Time to load utils op: 0.0005548000335693359 seconds +63: Time to load utils op: 0.0006048679351806641 seconds + 8: Time to load utils op: 0.0005068778991699219 seconds +63: Time to load utils op: 0.000614166259765625 seconds +63: Time to load utils op: 0.0006711483001708984 seconds + 8: Time to load utils op: 0.0005466938018798828 seconds +45: Time to load utils op: 0.0005249977111816406 secondsTime to load utils op: 0.0005300045013427734 seconds +45: +45: Time to load utils op: 0.0005340576171875 seconds + 8: Time to load utils op: 0.0004780292510986328 seconds + 8: Time to load utils op: 0.00043487548828125 secondsTime to load utils op: 0.0004317760467529297 secondsTime to load utils op: 0.00046753883361816406 seconds + 8: + 8: +41: Time to load utils op: 0.0005240440368652344 seconds +41: Time to load utils op: 0.00043129920959472656 seconds + 8: Time to load utils op: 0.0005633831024169922 seconds +41: Time to load utils op: 0.0005338191986083984 seconds + 8: Time to load utils op: 0.0004191398620605469 seconds +41: Time to load utils op: 0.0004398822784423828 seconds +45: Time to load utils op: 0.0005915164947509766 secondsTime to load utils op: 0.0006000995635986328 secondsTime to load utils op: 0.0005898475646972656 seconds +45: +45: +35: Time to load utils op: 0.000385284423828125 seconds +35: Time to load utils op: 0.0005002021789550781 seconds +45: Time to load utils op: 0.0006673336029052734 seconds +35: Time to load utils op: 0.00044727325439453125 seconds +41: Time to load utils op: 0.00045990943908691406 seconds +41: Time to load utils op: 0.0004448890686035156 seconds +45: Time to load utils op: 0.0006287097930908203 seconds +30: Time to load utils op: 0.0004794597625732422 seconds +12: Time to load utils op: 0.0005135536193847656 seconds +35: Time to load utils op: 0.0005586147308349609 seconds +12: Time to load utils op: 0.0005173683166503906 seconds +41: Time to load utils op: 0.0005192756652832031 seconds +35: Time to load utils op: 0.0004010200500488281 seconds +30: Time to load utils op: 0.00042629241943359375 seconds +41: Time to load utils op: 0.0005636215209960938 seconds +30: Time to load utils op: 0.0004513263702392578 secondsTime to load utils op: 0.0003941059112548828 seconds +30: +35: Time to load utils op: 0.0005357265472412109 seconds +35: Time to load utils op: 0.0005347728729248047 seconds +54: Time to load utils op: 0.0008728504180908203 seconds +12: Time to load utils op: 0.0005626678466796875 seconds +35: Time to load utils op: 0.0005555152893066406 seconds +12: Time to load utils op: 0.0006089210510253906 secondsTime to load utils op: 0.0005812644958496094 seconds +12: +30: Time to load utils op: 0.0005931854248046875 secondsTime to load utils op: 0.0005707740783691406 seconds +30: +12: Time to load utils op: 0.0005922317504882812 seconds +12: Time to load utils op: 0.0005934238433837891 seconds + 5: Time to load utils op: 0.0005750656127929688 seconds +12: Time to load utils op: 0.0005822181701660156 seconds + 5: Time to load utils op: 0.0005691051483154297 seconds +30: Time to load utils op: 0.0006246566772460938 seconds + 5: Time to load utils op: 0.0005636215209960938 secondsTime to load utils op: 0.0005502700805664062 seconds + 5: +30: Time to load utils op: 0.000640869140625 seconds + 5: Time to load utils op: 0.0005927085876464844 seconds +38: Time to load utils op: 0.0005342960357666016 seconds +38: Time to load utils op: 0.0005345344543457031 seconds +54: Time to load utils op: 0.0011279582977294922 seconds + 5: Time to load utils op: 0.0006072521209716797 seconds +38: Time to load utils op: 0.000560760498046875 seconds +38: Time to load utils op: 0.0005502700805664062 seconds + 5: Time to load utils op: 0.0005574226379394531 seconds + 5: Time to load utils op: 0.0005862712860107422 seconds +38: Time to load utils op: 0.00047659873962402344 seconds +38: Time to load utils op: 0.0005483627319335938 seconds +38: Time to load utils op: 0.000560760498046875 seconds +54: Time to load utils op: 0.0012497901916503906 seconds +38: Time to load utils op: 0.0005724430084228516 seconds +54: Time to load utils op: 0.0012357234954833984 seconds +59: Time to load utils op: 0.0005862712860107422 seconds +54: Time to load utils op: 0.0012552738189697266 seconds +42: Time to load utils op: 0.0005309581756591797 seconds +54: Time to load utils op: 0.0012755393981933594 seconds +54: Time to load utils op: 0.0012559890747070312 seconds +42: Time to load utils op: 0.0005643367767333984 secondsTime to load utils op: 0.000568389892578125 seconds +42: +54: Time to load utils op: 0.0012545585632324219 seconds +59: Time to load utils op: 0.0006153583526611328 secondsTime to load utils op: 0.0006265640258789062 seconds +59: +59: Time to load utils op: 0.0005977153778076172 seconds +42: Time to load utils op: 0.0006186962127685547 secondsTime to load utils op: 0.0005724430084228516 seconds +42: +59: Time to load utils op: 0.0006341934204101562 seconds +59: Time to load utils op: 0.0006318092346191406 seconds +59: Time to load utils op: 0.0006420612335205078 seconds +42: Time to load utils op: 0.0006785392761230469 seconds +59: Time to load utils op: 0.0006911754608154297 seconds +42: Time to load utils op: 0.0007371902465820312 seconds +42: Time to load utils op: 0.0006949901580810547 seconds +55: Time to load utils op: 0.0005180835723876953 seconds +55: Time to load utils op: 0.0003910064697265625 seconds +55: Time to load utils op: 0.00048661231994628906 seconds +51: Time to load utils op: 0.00048732757568359375 seconds +10: Time to load utils op: 0.0004029273986816406 seconds +10: Time to load utils op: 0.00040030479431152344 seconds +10: Time to load utils op: 0.0005357265472412109 seconds +51: Time to load utils op: 0.0004425048828125 secondsTime to load utils op: 0.00041937828063964844 seconds +27: Time to load utils op: 0.0005142688751220703 seconds +10: Time to load utils op: 0.000583648681640625 seconds +51: +55: Time to load utils op: 0.0004286766052246094 seconds +10: Time to load utils op: 0.0004146099090576172 secondsTime to load utils op: 0.00043964385986328125 seconds +10: +10: Time to load utils op: 0.0004038810729980469 seconds +18: Time to load utils op: 0.00048089027404785156 seconds +51: Time to load utils op: 0.0004551410675048828 seconds +51: Time to load utils op: 0.00043582916259765625 secondsTime to load utils op: 0.0005626678466796875 secondsTime to load utils op: 0.0005035400390625 seconds +10: Time to load utils op: 0.0004038810729980469 seconds + 1: Time to load utils op: 0.0005259513854980469 secondsTime to load utils op: 0.0005507469177246094 seconds + 1: +25: Time to load utils op: 0.00048542022705078125 seconds +55: Time to load utils op: 0.0003952980041503906 seconds +18: Time to load utils op: 0.00040602684020996094 seconds +51: +51: +25: Time to load utils op: 0.0004138946533203125 seconds +34: Time to load utils op: 0.0004899501800537109 seconds +55: Time to load utils op: 0.0004360675811767578 seconds +52: Time to load utils op: 0.0006124973297119141 seconds + 4: Time to load utils op: 0.0005316734313964844 seconds +51: Time to load utils op: 0.0004475116729736328 seconds + 1: Time to load utils op: 0.0005946159362792969 secondsTime to load utils op: 0.0005693435668945312 seconds + 1: +27: Time to load utils op: 0.0005278587341308594 secondsTime to load utils op: 0.0005788803100585938 seconds +27: +55: Time to load utils op: 0.0004317760467529297 seconds +55: Time to load utils op: 0.0004227161407470703 seconds +18: Time to load utils op: 0.0005052089691162109 seconds +16: Time to load utils op: 0.0004818439483642578 seconds +11: Time to load utils op: 0.0006985664367675781 seconds +27: Time to load utils op: 0.0005564689636230469 seconds +46: Time to load utils op: 0.0007560253143310547 seconds +11: Time to load utils op: 0.0006697177886962891 seconds +27: Time to load utils op: 0.0005719661712646484 seconds +34: Time to load utils op: 0.00041103363037109375 seconds +46: Time to load utils op: 0.0006976127624511719 seconds +52: Time to load utils op: 0.0004918575286865234 seconds + 7: Time to load utils op: 0.0005116462707519531 seconds +11: Time to load utils op: 0.0006549358367919922 secondsTime to load utils op: 0.0006699562072753906 seconds +11: +34: Time to load utils op: 0.0004410743713378906 seconds +52: Time to load utils op: 0.0006086826324462891 seconds +16: Time to load utils op: 0.00042557716369628906 secondsTime to load utils op: 0.0004448890686035156 secondsTime to load utils op: 0.0004520416259765625 seconds +16: +16: +18: Time to load utils op: 0.0005550384521484375 seconds + 2: Time to load utils op: 0.0005228519439697266 seconds +16: Time to load utils op: 0.0005559921264648438 seconds + 4: Time to load utils op: 0.0006978511810302734 secondsTime to load utils op: 0.0006506443023681641 seconds + 4: + 1: Time to load utils op: 0.0006349086761474609 seconds +25: Time to load utils op: 0.0006084442138671875 seconds +20: Time to load utils op: 0.0006725788116455078 seconds +18: Time to load utils op: 0.0005879402160644531 seconds +52: Time to load utils op: 0.0006508827209472656 seconds + 4: Time to load utils op: 0.0006721019744873047 seconds + 1: Time to load utils op: 0.0006232261657714844 secondsTime to load utils op: 0.0006043910980224609 seconds + 1: + 7: Time to load utils op: 0.00045943260192871094 secondsTime to load utils op: 0.0004417896270751953 seconds + 7: + 7: Time to load utils op: 0.0004508495330810547 seconds +27: Time to load utils op: 0.0006759166717529297 seconds +25: Time to load utils op: 0.0006020069122314453 seconds +36: Time to load utils op: 0.0006082057952880859 secondsTime to load utils op: 0.0006852149963378906 seconds +36: +36: Time to load utils op: 0.0006954669952392578 secondsTime to load utils op: 0.0006742477416992188 secondsTime to load utils op: 0.0006487369537353516 seconds +36: +36: +20: Time to load utils op: 0.0005793571472167969 secondsTime to load utils op: 0.000621795654296875 secondsTime to load utils op: 0.0006175041198730469 seconds +20: +20: + 2: Time to load utils op: 0.0005257129669189453 seconds + 4: Time to load utils op: 0.0006916522979736328 seconds + 1: Time to load utils op: 0.0006551742553710938 seconds + 7: Time to load utils op: 0.0004742145538330078 secondsTime to load utils op: 0.0004467964172363281 seconds + 7: +11: Time to load utils op: 0.0006608963012695312 seconds +27: Time to load utils op: 0.0006732940673828125 secondsTime to load utils op: 0.0006861686706542969 seconds +27: +25: Time to load utils op: 0.0006070137023925781 seconds +36: Time to load utils op: 0.0006878376007080078 seconds +26: Time to load utils op: 0.0005786418914794922 seconds +52: Time to load utils op: 0.0006518363952636719 secondsTime to load utils op: 0.0006642341613769531 seconds +52: + 7: Time to load utils op: 0.00047588348388671875 seconds +25: Time to load utils op: 0.0006306171417236328 seconds +36: Time to load utils op: 0.0004897117614746094 seconds +46: Time to load utils op: 0.00083160400390625 seconds +26: Time to load utils op: 0.0005865097045898438 secondsTime to load utils op: 0.0005905628204345703 seconds +26: +52: Time to load utils op: 0.0006439685821533203 seconds + 2: Time to load utils op: 0.0006320476531982422 secondsTime to load utils op: 0.0006451606750488281 seconds + 2: +16: Time to load utils op: 0.0005638599395751953 seconds + 4: Time to load utils op: 0.0006749629974365234 seconds + 7: Time to load utils op: 0.00042700767517089844 seconds +11: Time to load utils op: 0.0006890296936035156 secondsTime to load utils op: 0.0006885528564453125 seconds +11: +46: Time to load utils op: 0.0008914470672607422 secondsTime to load utils op: 0.0009133815765380859 seconds +46: +18: Time to load utils op: 0.0007293224334716797 seconds +52: Time to load utils op: 0.0006968975067138672 seconds +16: Time to load utils op: 0.0006151199340820312 seconds + 4: Time to load utils op: 0.0007686614990234375 seconds +11: Time to load utils op: 0.0007121562957763672 seconds +36: Time to load utils op: 0.000629425048828125 seconds +46: Time to load utils op: 0.0008823871612548828 seconds +20: Time to load utils op: 0.0010085105895996094 seconds +18: Time to load utils op: 0.0007081031799316406 seconds +53: Time to load utils op: 0.0005342960357666016 seconds + 4: Time to load utils op: 0.0007352828979492188 seconds +46: Time to load utils op: 0.0008394718170166016 seconds +46: Time to load utils op: 0.0008223056793212891 seconds +18: Time to load utils op: 0.0007526874542236328 seconds +16: Time to load utils op: 0.000614166259765625 seconds +25: Time to load utils op: 0.0007152557373046875 seconds +34: Time to load utils op: 0.0005910396575927734 seconds +25: Time to load utils op: 0.0007097721099853516 seconds + 2: Time to load utils op: 0.0006718635559082031 seconds +53: Time to load utils op: 0.0005807876586914062 seconds +34: Time to load utils op: 0.0005908012390136719 seconds + 2: Time to load utils op: 0.0007658004760742188 seconds +53: Time to load utils op: 0.0004584789276123047 seconds +34: Time to load utils op: 0.0006103515625 secondsTime to load utils op: 0.0005996227264404297 seconds +34: +26: Time to load utils op: 0.0007193088531494141 seconds +20: Time to load utils op: 0.0011363029479980469 seconds + 2: Time to load utils op: 0.0007643699645996094 seconds +34: Time to load utils op: 0.0004611015319824219 seconds +26: Time to load utils op: 0.0007519721984863281 seconds +20: Time to load utils op: 0.001108407974243164 seconds +53: Time to load utils op: 0.0005657672882080078 seconds +26: Time to load utils op: 0.0006575584411621094 seconds +26: Time to load utils op: 0.0007371902465820312 seconds + 2: Time to load utils op: 0.0008795261383056641 seconds +26: Time to load utils op: 0.0004756450653076172 seconds +20: Time to load utils op: 0.0012192726135253906 seconds +53: Time to load utils op: 0.0005652904510498047 seconds +53: Time to load utils op: 0.0006296634674072266 seconds +53: Time to load utils op: 0.0006504058837890625 seconds +53: Time to load utils op: 0.0006444454193115234 seconds +23: Time to load utils op: 0.0012462139129638672 seconds +23: Time to load utils op: 0.0012326240539550781 seconds +23: Time to load utils op: 0.0012204647064208984 seconds +23: Time to load utils op: 0.0013191699981689453 seconds +31: Time to load utils op: 0.0005118846893310547 seconds +31: Time to load utils op: 0.0005955696105957031 seconds +31: Time to load utils op: 0.0005822181701660156 seconds +31: Time to load utils op: 0.0006165504455566406 secondsTime to load utils op: 0.0006191730499267578 seconds +31: +31: Time to load utils op: 0.0006279945373535156 seconds +31: Time to load utils op: 0.0005824565887451172 secondsTime to load utils op: 0.0005855560302734375 seconds +31: +62: Time to load utils op: 0.0004990100860595703 seconds +13: Time to load utils op: 0.0004730224609375 seconds +21: Time to load utils op: 0.0005059242248535156 seconds +62: Time to load utils op: 0.0004401206970214844 seconds +57: Time to load utils op: 0.0004742145538330078 seconds +62: Time to load utils op: 0.0004992485046386719 seconds +13: Time to load utils op: 0.0005159378051757812 seconds +13: Time to load utils op: 0.00041866302490234375 secondsTime to load utils op: 0.0004222393035888672 seconds +13: +62: Time to load utils op: 0.0004546642303466797 seconds +62: Time to load utils op: 0.0004582405090332031 seconds +13: Time to load utils op: 0.00042128562927246094 seconds +21: Time to load utils op: 0.0005707740783691406 seconds +57: Time to load utils op: 0.0004515647888183594 seconds +62: Time to load utils op: 0.0004551410675048828 seconds +21: Time to load utils op: 0.0005915164947509766 seconds +62: Time to load utils op: 0.000537872314453125 seconds +62: Time to load utils op: 0.0005366802215576172 seconds +13: Time to load utils op: 0.0005552768707275391 seconds +21: Time to load utils op: 0.0006034374237060547 seconds +13: Time to load utils op: 0.0005595684051513672 seconds +24: Time to load utils op: 0.0007686614990234375 secondsTime to load utils op: 0.0007390975952148438 seconds +24: +21: Time to load utils op: 0.0006191730499267578 seconds +21: Time to load utils op: 0.0006253719329833984 seconds +57: Time to load utils op: 0.00044989585876464844 secondsTime to load utils op: 0.00046062469482421875 seconds +57: Time to load utils op: 0.0004372596740722656 seconds +57: Time to load utils op: 0.0004811286926269531 secondsTime to load utils op: 0.0004639625549316406 seconds +57: +57: +57: Time to load utils op: 0.0004458427429199219 seconds +21: Time to load utils op: 0.0007901191711425781 seconds +21: Time to load utils op: 0.0007655620574951172 seconds +47: Time to load utils op: 0.0003705024719238281 seconds +47: Time to load utils op: 0.0004775524139404297 seconds +24: Time to load utils op: 0.001299142837524414 seconds +24: Time to load utils op: 0.0011870861053466797 seconds +47: Time to load utils op: 0.0004100799560546875 seconds +29: Time to load utils op: 0.0004696846008300781 seconds +24: Time to load utils op: 0.0012218952178955078 seconds +24: Time to load utils op: 0.001211404800415039 seconds +47: Time to load utils op: 0.0004975795745849609 seconds +47: Time to load utils op: 0.0004658699035644531 seconds +47: Time to load utils op: 0.0004718303680419922 seconds +22: Time to load utils op: 0.0004603862762451172 seconds +24: Time to load utils op: 0.0012009143829345703 seconds +47: Time to load utils op: 0.00040078163146972656 seconds +39: Time to load utils op: 0.0005354881286621094 seconds +22: Time to load utils op: 0.0004210472106933594 seconds +24: Time to load utils op: 0.0012001991271972656 seconds +47: Time to load utils op: 0.0005519390106201172 seconds +29: Time to load utils op: 0.00047969818115234375 seconds +22: Time to load utils op: 0.00042438507080078125 secondsTime to load utils op: 0.0004181861877441406 seconds +22: +29: Time to load utils op: 0.00042748451232910156 secondsTime to load utils op: 0.00044536590576171875 seconds +29: +39: Time to load utils op: 0.0005338191986083984 secondsTime to load utils op: 0.0005180835723876953 seconds +39: +40: Time to load utils op: 0.0005419254302978516 seconds +40: Time to load utils op: 0.0005567073822021484 seconds +29: Time to load utils op: 0.00044155120849609375 secondsTime to load utils op: 0.0004374980926513672 secondsTime to load utils op: 0.0004062652587890625 seconds +29: +29: +29: Time to load utils op: 0.00044226646423339844 seconds +39: Time to load utils op: 0.0005371570587158203 seconds +22: Time to load utils op: 0.0005314350128173828 seconds +22: Time to load utils op: 0.0005166530609130859 seconds +22: Time to load utils op: 0.0005555152893066406 seconds +58: Time to load utils op: 0.0004966259002685547 seconds +58: Time to load utils op: 0.0005366802215576172 secondsTime to load utils op: 0.0005228519439697266 seconds +58: +39: Time to load utils op: 0.0006358623504638672 seconds +39: Time to load utils op: 0.0006604194641113281 seconds +40: Time to load utils op: 0.0006718635559082031 secondsTime to load utils op: 0.0006299018859863281 seconds +40: +39: Time to load utils op: 0.0006968975067138672 seconds +22: Time to load utils op: 0.0005710124969482422 seconds +40: Time to load utils op: 0.0006864070892333984 seconds +39: Time to load utils op: 0.0006930828094482422 seconds +40: Time to load utils op: 0.0006830692291259766 seconds +58: Time to load utils op: 0.0005846023559570312 seconds +58: Time to load utils op: 0.0006093978881835938 seconds +40: Time to load utils op: 0.0007147789001464844 secondsTime to load utils op: 0.0007281303405761719 seconds +40: +58: Time to load utils op: 0.00063323974609375 seconds +58: Time to load utils op: 0.000583648681640625 secondsTime to load utils op: 0.0006442070007324219 seconds +58: +60: Time to load utils op: 0.0005218982696533203 seconds +60: Time to load utils op: 0.0004987716674804688 seconds +28: Time to load utils op: 0.0005419254302978516 seconds +28: Time to load utils op: 0.0005643367767333984 seconds +61: Time to load utils op: 0.0004904270172119141 seconds +61: Time to load utils op: 0.0005064010620117188 seconds +61: Time to load utils op: 0.0004515647888183594 seconds +28: Time to load utils op: 0.0005953311920166016 seconds +28: Time to load utils op: 0.0006060600280761719 seconds +60: Time to load utils op: 0.0005745887756347656 seconds +28: Time to load utils op: 0.0005986690521240234 seconds +60: Time to load utils op: 0.0005769729614257812 seconds +60: Time to load utils op: 0.0005736351013183594 seconds +60: Time to load utils op: 0.0005922317504882812 seconds +28: Time to load utils op: 0.0005872249603271484 seconds +61: Time to load utils op: 0.0005955696105957031 seconds +60: Time to load utils op: 0.0005698204040527344 seconds +28: Time to load utils op: 0.0005660057067871094 seconds +61: Time to load utils op: 0.0005917549133300781 seconds +60: Time to load utils op: 0.0006322860717773438 seconds +28: Time to load utils op: 0.0006544589996337891 seconds +61: Time to load utils op: 0.0006420612335205078 secondsTime to load utils op: 0.0006470680236816406 seconds +61: +61: Time to load utils op: 0.0006816387176513672 seconds +56: Time to load utils op: 0.0004837512969970703 seconds +56: Time to load utils op: 0.00037384033203125 seconds +56: Time to load utils op: 0.00044226646423339844 secondsTime to load utils op: 0.00046062469482421875 secondsTime to load utils op: 0.00045943260192871094 seconds +56: +56: +56: Time to load utils op: 0.0005691051483154297 seconds +56: Time to load utils op: 0.00039839744567871094 seconds +56: Time to load utils op: 0.0006198883056640625 seconds + 0: [2022-12-01 18:27:00,334] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 + 0: [2022-12-01 18:27:00,334] [INFO] [utils.py:828:see_memory_usage] MA 8.44 GB Max_MA 8.44 GB CA 10.57 GB Max_CA 11 GB + 0: [2022-12-01 18:27:00,335] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.1 GB, percent = 6.2% +13: Time to load utils op: 0.0006015300750732422 seconds + 0: [2022-12-01 18:27:00,385] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 + 0: [2022-12-01 18:27:00,386] [INFO] [utils.py:828:see_memory_usage] MA 12.5 GB Max_MA 12.5 GB CA 16.62 GB Max_CA 17 GB + 0: [2022-12-01 18:27:00,386] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.19 GB, percent = 6.2% + 0: [2022-12-01 18:27:00,427] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 + 0: [2022-12-01 18:27:00,428] [INFO] [utils.py:828:see_memory_usage] MA 12.5 GB Max_MA 12.5 GB CA 16.62 GB Max_CA 17 GB + 0: [2022-12-01 18:27:00,428] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.21 GB, percent = 6.2% + 0: [2022-12-01 18:27:00,481] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 + 0: [2022-12-01 18:27:00,482] [INFO] [utils.py:828:see_memory_usage] MA 12.51 GB Max_MA 12.51 GB CA 16.62 GB Max_CA 17 GB + 0: [2022-12-01 18:27:00,482] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.36 GB, percent = 6.2% + 0: [2022-12-01 18:27:00,525] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer + 0: [2022-12-01 18:27:00,525] [INFO] [utils.py:828:see_memory_usage] MA 12.51 GB Max_MA 12.51 GB CA 16.62 GB Max_CA 17 GB + 0: [2022-12-01 18:27:00,526] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.47 GB, percent = 6.3% + 0: [2022-12-01 18:27:00,567] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer + 0: [2022-12-01 18:27:00,568] [INFO] [utils.py:828:see_memory_usage] MA 12.64 GB Max_MA 12.64 GB CA 16.62 GB Max_CA 17 GB + 0: [2022-12-01 18:27:00,568] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.47 GB, percent = 6.3% + 0: [2022-12-01 18:27:00,602] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer + 0: [2022-12-01 18:27:00,602] [INFO] [utils.py:828:see_memory_usage] MA 12.64 GB Max_MA 12.64 GB CA 16.62 GB Max_CA 17 GB + 0: [2022-12-01 18:27:00,602] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.47 GB, percent = 6.3% + 0: [2022-12-01 18:27:00,602] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam + 0: [2022-12-01 18:27:00,603] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler + 0: [2022-12-01 18:27:00,603] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = + 0: [2022-12-01 18:27:00,603] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] + 0: [2022-12-01 18:27:00,603] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: + 0: [2022-12-01 18:27:00,603] [INFO] [config.py:1011:print] activation_checkpointing_config { + 0: "partition_activations": false, + 0: "contiguous_memory_optimization": false, + 0: "cpu_checkpointing": false, + 0: "number_checkpoints": null, + 0: "synchronize_checkpoint_boundary": false, + 0: "profile": false + 0: } + 0: [2022-12-01 18:27:00,603] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} + 0: [2022-12-01 18:27:00,603] [INFO] [config.py:1011:print] amp_enabled .................. False + 0: [2022-12-01 18:27:00,603] [INFO] [config.py:1011:print] amp_params ................... False + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] autotuning_config ............ { + 0: "enabled": false, + 0: "start_step": null, + 0: "end_step": null, + 0: "metric_path": null, + 0: "arg_mappings": null, + 0: "metric": "throughput", + 0: "model_info": null, + 0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", + 0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", + 0: "overwrite": true, + 0: "fast": true, + 0: "start_profile_step": 3, + 0: "end_profile_step": 5, + 0: "tuner_type": "gridsearch", + 0: "tuner_early_stopping": 5, + 0: "tuner_num_trials": 50, + 0: "model_info_path": null, + 0: "mp_size": 1, + 0: "max_train_batch_size": null, + 0: "min_train_batch_size": 1, + 0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, + 0: "min_train_micro_batch_size_per_gpu": 1, + 0: "num_tuning_micro_batch_sizes": 3 + 0: } + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] bfloat16_enabled ............. True + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] comms_config ................. + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] communication_data_type ...... None + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa + 0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] curriculum_enabled ........... False + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] curriculum_params ............ False + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] dataloader_drop_last ......... False + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] disable_allgather ............ False + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] dump_state ................... False + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] elasticity_enabled ........... False + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] flops_profiler_config ........ { + 0: "enabled": false, + 0: "profile_step": 1, + 0: "module_depth": -1, + 0: "top_modules": 1, + 0: "detailed": true, + 0: "output_file": null + 0: } + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] fp16_auto_cast ............... None + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] fp16_enabled ................. False + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] global_rank .................. 0 + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 4 + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] load_universal_checkpoint .... False + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] loss_scale ................... 1.0 + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] memory_breakdown ............. False + 0: [2022-12-01 18:27:00,604] [INFO] [config.py:1011:print] monitor_config ............... + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] nebula_config ................ { + 0: "enabled": false, + 0: "persistent_storage_path": null, + 0: "persistent_time_interval": 100, + 0: "num_of_version_in_retention": 2, + 0: "enable_nebula_load": true, + 0: "load_path": null + 0: } + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] optimizer_name ............... None + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] optimizer_params ............. None + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] pld_enabled .................. False + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] pld_params ................... False + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] prescale_gradients ........... False + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] scheduler_name ............... None + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] scheduler_params ............. None + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] sparse_attention ............. None + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] steps_per_print .............. 2000 + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] train_batch_size ............. 1024 + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 2 + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] use_node_local_storage ....... False + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] world_size ................... 128 + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] zero_enabled ................. False + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 + 0: [2022-12-01 18:27:00,605] [INFO] [config.py:996:print_user_config] json = { + 0: "train_micro_batch_size_per_gpu": 2, + 0: "train_batch_size": 1.024000e+03, + 0: "gradient_clipping": 1.0, + 0: "zero_optimization": { + 0: "stage": 0 + 0: }, + 0: "bf16": { + 0: "enabled": true + 0: }, + 0: "steps_per_print": 2.000000e+03, + 0: "wall_clock_breakdown": false + 0: } + 0: Time to load utils op: 0.000396728515625 seconds + 0: [2022-12-01 18:27:00,606] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=4 micro_batch_size=2 +32: [2022-12-01 18:27:01,098] [INFO] [engine.py:145:__init__] RANK=256 STAGE=1 LAYERS=25 [24, 49) STAGE_PARAMS=2226427904 (2226.428M) TOTAL_PARAMS=8905695232 (8905.695M) UNIQUE_PARAMS=8682348544 (8682.349M) +32: [2022-12-01 18:27:01,098] [INFO] [engine.py:145:__init__] RANK=257 STAGE=1 LAYERS=25 [24, 49) STAGE_PARAMS=2226427904 (2226.428M) TOTAL_PARAMS=8905695232 (8905.695M) UNIQUE_PARAMS=8682348544 (8682.349M) + 0: [2022-12-01 18:27:01,098] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=24 [0, 24) STAGE_PARAMS=2226419712 (2226.420M) TOTAL_PARAMS=8905695232 (8905.695M) UNIQUE_PARAMS=8682348544 (8682.349M) + 0: [2022-12-01 18:27:01,098] [INFO] [engine.py:145:__init__] RANK=1 STAGE=0 LAYERS=24 [0, 24) STAGE_PARAMS=2226419712 (2226.420M) TOTAL_PARAMS=8905695232 (8905.695M) UNIQUE_PARAMS=8682348544 (8682.349M) + 0: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 0: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 0: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 0: WARNING: could not find the metadata file checkpoints_8b7beta + 0: will not load any checkpoints and will start from random + 0: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +32: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +48: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +56: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 0: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +60: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 0: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +62: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +63: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +63: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +63: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +63: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +48: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +54: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +44: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +32: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +52: [2022-12-01 18:27:03,773] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +24: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +28: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +46: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +55: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +48: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +16: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +56: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +62: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +60: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +58: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +31: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +47: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +47: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +47: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +40: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +63: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +61: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +59: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +30: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +53: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +50: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +63: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +52: [2022-12-01 18:27:03,773] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +38: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 0: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 0: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +29: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +42: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +61: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +47: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +12: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +26: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +26: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +26: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +47: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +24: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +45: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +54: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +27: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +43: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +26: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +41: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 8: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +40: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +62: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +60: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +39: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +11: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +14: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +44: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +28: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +32: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +46: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +20: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +47: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +48: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +53: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +30: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +23: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +16: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +22: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +36: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +36: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +58: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +34: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +50: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +26: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +15: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +15: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +19: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +19: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +51: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +63: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +57: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +25: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +36: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +36: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +36: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +31: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +15: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +15: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +19: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +19: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 4: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +36: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +26: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +15: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +15: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +35: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +63: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 7: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +13: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +37: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +37: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +37: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +59: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +27: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +43: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +37: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +33: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +33: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +33: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 6: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +12: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +28: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +37: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +37: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +47: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 2: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +19: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +19: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +53: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +60: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +42: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +46: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +49: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +49: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +10: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 8: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +52: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +45: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 3: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +40: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +56: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +17: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +25: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +38: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +32: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +50: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +49: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 5: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +20: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +41: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +21: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +61: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +47: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 9: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +29: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +23: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +54: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +51: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +14: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +24: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +26: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +26: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +49: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +49: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +18: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +15: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +30: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +57: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +36: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +49: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +33: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +33: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +15: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +12: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +11: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +36: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +58: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +48: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +62: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +44: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 8: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +33: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +45: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +59: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +35: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +16: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +39: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +13: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +34: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +49: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +55: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +37: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 6: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +53: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +30: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 7: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +27: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +42: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +28: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +50: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +52: [2022-12-01 18:27:03,774] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +60: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +25: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +49: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +10: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +37: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +31: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 1: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +22: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 5: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +19: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +19: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +20: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +56: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +41: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +57: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +14: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +38: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +34: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +32: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +33: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +33: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +61: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 9: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +12: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +29: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +23: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +51: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +17: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +43: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +44: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +28: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +55: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +45: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +53: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +40: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 4: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +27: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +46: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +21: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 6: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 3: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 5: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +30: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +54: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 7: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +25: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +62: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +16: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +60: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +50: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +12: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 2: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +42: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +52: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +59: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +35: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +51: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +11: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +43: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +41: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 8: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +48: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +39: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +14: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +24: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +44: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +28: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +32: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +18: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +31: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 3: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 9: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +40: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 4: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 1: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +13: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +58: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +34: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +46: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +55: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +61: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +45: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +29: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +23: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +22: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +10: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 6: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +56: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +17: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +25: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +50: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +54: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +21: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +30: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +60: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +38: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +41: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +52: [2022-12-01 18:27:03,775] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +16: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +48: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 5: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +53: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +43: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +58: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +12: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 7: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +22: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +32: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +61: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +11: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +57: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +34: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +55: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +23: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +51: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +44: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 8: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +31: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +45: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 2: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +40: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 1: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +27: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +13: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +20: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +30: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +24: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 5: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +35: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 4: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +42: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +28: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +10: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +29: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +60: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +39: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +25: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +50: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +41: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 6: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 3: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 9: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +53: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +17: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +59: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +12: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +16: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 7: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +11: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +43: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +18: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +54: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +62: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +14: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +46: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +45: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +48: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +34: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 2: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 8: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +44: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +52: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +58: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +61: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +40: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +23: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +56: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +51: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +57: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +27: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +32: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +20: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 9: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +53: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 4: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 1: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +43: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +25: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +38: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +28: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +50: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +55: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 6: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +17: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +11: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +13: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +24: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +42: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +46: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 5: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +21: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +31: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 3: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +12: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +62: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +41: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +34: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 2: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +35: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +54: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 9: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +59: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +16: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 7: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +11: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 5: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 8: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +52: [2022-12-01 18:27:03,776] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +30: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +51: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +39: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +27: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +58: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +55: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +18: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +29: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +40: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +43: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +25: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +14: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +44: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 6: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +61: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +45: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +23: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +54: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +57: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +46: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +31: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 2: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +56: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +62: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +21: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 3: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 4: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 1: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 7: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +38: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +42: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +24: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 5: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +41: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +16: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +34: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +20: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +51: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +17: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +11: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +14: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +55: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 6: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +39: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +13: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 8: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 9: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 2: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +56: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +27: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +58: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +20: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +31: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +59: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +29: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +35: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 4: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 1: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 7: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +13: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +38: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +22: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +42: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +18: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +21: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +23: [2022-12-01 18:27:03,777] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +17: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +39: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +57: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +14: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +24: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 3: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 9: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +29: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +18: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 2: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +10: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 3: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +59: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +35: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +39: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +22: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +20: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 4: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +17: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +38: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +10: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 1: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +57: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +13: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +18: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +21: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +35: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. + 1: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +10: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +21: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +22: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +18: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +10: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +22: [2022-12-01 18:27:03,778] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7beta/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +63: time (ms) | load-checkpoint: 13.02 + 0: estimated model parameters: 8.905678848 + 0: estimated model parameters without embeddings: 8.458985472 + 0: [after model, optimizer, and learning rate scheduler are built] datetime: 2022-12-01 18:27:05 + 0: > building train, validation, and test datasets ... + 0: > datasets target sizes (minimum size): + 0: train: 5625981 + 0: validation: 6144 + 0: test: 1024 + 0: > building train, validation, and test datasets for GPT ... + 0: > building dataset index ... + 0: reading sizes... + 0: reading pointers... + 0: reading document index... + 0: creating numpy buffer of mmap... + 0: creating memory view of numpy buffer... + 0: > finished creating indexed dataset in 0.058187 seconds + 0: number of documents: 210604984 + 0: > dataset split: + 0: train: + 0: document indices in [0, 199864130) total of 199864130 documents + 0: validation: + 0: document indices in [199864130, 210394379) total of 10530249 documents + 0: test: + 0: document indices in [210394379, 210604984) total of 210605 documents + 0: > loading doc-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_train_indexmap_5625981ns_2048sl_1234s_doc_idx.npy + 0: > loading sample-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_train_indexmap_5625981ns_2048sl_1234s_sample_idx.npy + 0: > loading shuffle-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_train_indexmap_5625981ns_2048sl_1234s_shuffle_idx.npy + 0: loaded indexed file in 0.143 seconds + 0: total number of samples: 173377817 + 0: total number of epochs: 1 + 0: > loading doc-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_valid_indexmap_6144ns_2048sl_1234s_doc_idx.npy + 0: > loading sample-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_valid_indexmap_6144ns_2048sl_1234s_sample_idx.npy + 0: > loading shuffle-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_valid_indexmap_6144ns_2048sl_1234s_shuffle_idx.npy + 0: loaded indexed file in 0.074 seconds + 0: total number of samples: 9118345 + 0: total number of epochs: 1 + 0: > loading doc-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_test_indexmap_1024ns_2048sl_1234s_doc_idx.npy + 0: > loading sample-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_test_indexmap_1024ns_2048sl_1234s_sample_idx.npy + 0: > loading shuffle-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_test_indexmap_1024ns_2048sl_1234s_shuffle_idx.npy + 0: loaded indexed file in 0.074 seconds + 0: total number of samples: 182928 + 0: total number of epochs: 1 + 0: > finished creating GPT datasets ... + 0: [after dataloaders are built] datetime: 2022-12-01 18:27:24 + 0: done with setup ... + 0: training ... + 0: Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: +63: time (ms) | model-and-optimizer-setup: 32288.50 | train/valid/test-data-iterators-setup: 15170.30 +32: [001-001] 8.9057B / 8.4590B +32: [000-001] 8.9057B / 8.4590B + 0: [001-000] 8.9057B / 8.4590B + 0: [000-000] 8.9057B / 8.4590B + 0: [before the start of training step] datetime: 2022-12-01 18:27:24 + 0: [Rank 0] (after 10 iterations) memory (MB) | allocated: 17215.4833984375 | max allocated: 59062.27294921875 | reserved: 63330.0 | max reserved: 63394.0 + 0: [Rank 1] (after 10 iterations) memory (MB) | allocated: 17215.4833984375 | max allocated: 59062.27294921875 | reserved: 62374.0 | max reserved: 62386.0 +32: [Rank 256] (after 10 iterations) memory (MB) | allocated: 18102.45556640625 | max allocated: 39591.79736328125 | reserved: 47514.0 | max reserved: 47514.0 +32: [Rank 257] (after 10 iterations) memory (MB) | allocated: 18102.45556640625 | max allocated: 39591.79736328125 | reserved: 46066.0 | max reserved: 46066.0 +63: iteration 10/ 5494 | consumed samples: 10240 | consumed tokens: 20971520 | elapsed time per iteration (s): 11.39 | learning rate: 3.640E-05 | global batch size: 1024 | lm loss: 1.421445E+01 | grad norm: 8.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 89.916 | TFLOPs: 20.10 | +63: iteration 20/ 5494 | consumed samples: 20480 | consumed tokens: 41943040 | elapsed time per iteration (s): 6.85 | learning rate: 7.280E-05 | global batch size: 1024 | lm loss: 1.027657E+01 | grad norm: 10.956 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 149.596 | TFLOPs: 33.44 | +63: iteration 30/ 5494 | consumed samples: 30720 | consumed tokens: 62914560 | elapsed time per iteration (s): 6.72 | learning rate: 1.092E-04 | global batch size: 1024 | lm loss: 7.730584E+00 | grad norm: 7.827 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 152.399 | TFLOPs: 34.07 | +63: iteration 40/ 5494 | consumed samples: 40960 | consumed tokens: 83886080 | elapsed time per iteration (s): 6.39 | learning rate: 1.456E-04 | global batch size: 1024 | lm loss: 7.357300E+00 | grad norm: 1.967 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 160.153 | TFLOPs: 35.80 | +63: iteration 50/ 5494 | consumed samples: 51200 | consumed tokens: 104857600 | elapsed time per iteration (s): 6.22 | learning rate: 1.820E-04 | global batch size: 1024 | lm loss: 7.289688E+00 | grad norm: 3.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 164.625 | TFLOPs: 36.80 | +63: iteration 60/ 5494 | consumed samples: 61440 | consumed tokens: 125829120 | elapsed time per iteration (s): 6.14 | learning rate: 2.000E-04 | global batch size: 1024 | lm loss: 7.254121E+00 | grad norm: 3.062 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 166.739 | TFLOPs: 37.28 | +63: iteration 70/ 5494 | consumed samples: 71680 | consumed tokens: 146800640 | elapsed time per iteration (s): 5.80 | learning rate: 2.000E-04 | global batch size: 1024 | lm loss: 7.174867E+00 | grad norm: 0.964 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.474 | TFLOPs: 39.45 | +63: iteration 80/ 5494 | consumed samples: 81920 | consumed tokens: 167772160 | elapsed time per iteration (s): 5.92 | learning rate: 2.000E-04 | global batch size: 1024 | lm loss: 7.044849E+00 | grad norm: 3.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.882 | TFLOPs: 38.65 | +63: iteration 90/ 5494 | consumed samples: 92160 | consumed tokens: 188743680 | elapsed time per iteration (s): 6.05 | learning rate: 2.000E-04 | global batch size: 1024 | lm loss: 6.953670E+00 | grad norm: 3.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 169.275 | TFLOPs: 37.84 | +63: iteration 100/ 5494 | consumed samples: 102400 | consumed tokens: 209715200 | elapsed time per iteration (s): 5.65 | learning rate: 2.000E-04 | global batch size: 1024 | lm loss: 6.793136E+00 | grad norm: 1.792 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.152 | TFLOPs: 40.50 | +63: iteration 110/ 5494 | consumed samples: 112640 | consumed tokens: 230686720 | elapsed time per iteration (s): 5.97 | learning rate: 2.000E-04 | global batch size: 1024 | lm loss: 6.744849E+00 | grad norm: 2.855 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.576 | TFLOPs: 38.36 | +63: iteration 120/ 5494 | consumed samples: 122880 | consumed tokens: 251658240 | elapsed time per iteration (s): 5.92 | learning rate: 1.999E-04 | global batch size: 1024 | lm loss: 6.649638E+00 | grad norm: 3.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.870 | TFLOPs: 38.65 | +63: iteration 130/ 5494 | consumed samples: 133120 | consumed tokens: 272629760 | elapsed time per iteration (s): 6.06 | learning rate: 1.999E-04 | global batch size: 1024 | lm loss: 6.565190E+00 | grad norm: 1.854 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 169.034 | TFLOPs: 37.79 | +63: iteration 140/ 5494 | consumed samples: 143360 | consumed tokens: 293601280 | elapsed time per iteration (s): 6.33 | learning rate: 1.999E-04 | global batch size: 1024 | lm loss: 6.485033E+00 | grad norm: 2.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 161.725 | TFLOPs: 36.16 | +63: iteration 150/ 5494 | consumed samples: 153600 | consumed tokens: 314572800 | elapsed time per iteration (s): 5.65 | learning rate: 1.999E-04 | global batch size: 1024 | lm loss: 6.449390E+00 | grad norm: 5.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.334 | TFLOPs: 40.54 | +63: iteration 160/ 5494 | consumed samples: 163840 | consumed tokens: 335544320 | elapsed time per iteration (s): 5.81 | learning rate: 1.998E-04 | global batch size: 1024 | lm loss: 6.369135E+00 | grad norm: 1.793 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.380 | TFLOPs: 39.43 | +63: iteration 170/ 5494 | consumed samples: 174080 | consumed tokens: 356515840 | elapsed time per iteration (s): 5.65 | learning rate: 1.998E-04 | global batch size: 1024 | lm loss: 6.273567E+00 | grad norm: 2.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.246 | TFLOPs: 40.52 | +63: iteration 180/ 5494 | consumed samples: 184320 | consumed tokens: 377487360 | elapsed time per iteration (s): 5.93 | learning rate: 1.998E-04 | global batch size: 1024 | lm loss: 6.211970E+00 | grad norm: 1.897 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.720 | TFLOPs: 38.61 | +63: iteration 190/ 5494 | consumed samples: 194560 | consumed tokens: 398458880 | elapsed time per iteration (s): 5.65 | learning rate: 1.997E-04 | global batch size: 1024 | lm loss: 6.150782E+00 | grad norm: 2.043 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.079 | TFLOPs: 40.48 | +63: iteration 200/ 5494 | consumed samples: 204800 | consumed tokens: 419430400 | elapsed time per iteration (s): 5.95 | learning rate: 1.997E-04 | global batch size: 1024 | lm loss: 6.075595E+00 | grad norm: 1.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.232 | TFLOPs: 38.51 | +63: iteration 210/ 5494 | consumed samples: 215040 | consumed tokens: 440401920 | elapsed time per iteration (s): 6.36 | learning rate: 1.996E-04 | global batch size: 1024 | lm loss: 5.975476E+00 | grad norm: 1.943 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 160.989 | TFLOPs: 35.99 | +63: iteration 220/ 5494 | consumed samples: 225280 | consumed tokens: 461373440 | elapsed time per iteration (s): 5.97 | learning rate: 1.996E-04 | global batch size: 1024 | lm loss: 5.902942E+00 | grad norm: 1.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.659 | TFLOPs: 38.38 | +63: iteration 230/ 5494 | consumed samples: 235520 | consumed tokens: 482344960 | elapsed time per iteration (s): 6.56 | learning rate: 1.995E-04 | global batch size: 1024 | lm loss: 5.869223E+00 | grad norm: 1.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 156.159 | TFLOPs: 34.91 | +63: iteration 240/ 5494 | consumed samples: 245760 | consumed tokens: 503316480 | elapsed time per iteration (s): 5.90 | learning rate: 1.995E-04 | global batch size: 1024 | lm loss: 5.777339E+00 | grad norm: 1.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.475 | TFLOPs: 38.78 | +63: iteration 250/ 5494 | consumed samples: 256000 | consumed tokens: 524288000 | elapsed time per iteration (s): 6.16 | learning rate: 1.994E-04 | global batch size: 1024 | lm loss: 5.708573E+00 | grad norm: 2.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 166.299 | TFLOPs: 37.18 | +63: iteration 260/ 5494 | consumed samples: 266240 | consumed tokens: 545259520 | elapsed time per iteration (s): 5.96 | learning rate: 1.994E-04 | global batch size: 1024 | lm loss: 5.681989E+00 | grad norm: 0.929 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.915 | TFLOPs: 38.43 | +63: iteration 270/ 5494 | consumed samples: 276480 | consumed tokens: 566231040 | elapsed time per iteration (s): 5.66 | learning rate: 1.993E-04 | global batch size: 1024 | lm loss: 5.602813E+00 | grad norm: 1.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.776 | TFLOPs: 40.42 | +63: iteration 280/ 5494 | consumed samples: 286720 | consumed tokens: 587202560 | elapsed time per iteration (s): 5.82 | learning rate: 1.992E-04 | global batch size: 1024 | lm loss: 5.550649E+00 | grad norm: 1.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.961 | TFLOPs: 39.34 | +63: iteration 290/ 5494 | consumed samples: 296960 | consumed tokens: 608174080 | elapsed time per iteration (s): 5.70 | learning rate: 1.992E-04 | global batch size: 1024 | lm loss: 5.516335E+00 | grad norm: 0.816 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.534 | TFLOPs: 40.14 | +63: iteration 300/ 5494 | consumed samples: 307200 | consumed tokens: 629145600 | elapsed time per iteration (s): 5.67 | learning rate: 1.991E-04 | global batch size: 1024 | lm loss: 5.492454E+00 | grad norm: 0.796 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.740 | TFLOPs: 40.41 | +63: iteration 310/ 5494 | consumed samples: 317440 | consumed tokens: 650117120 | elapsed time per iteration (s): 6.24 | learning rate: 1.990E-04 | global batch size: 1024 | lm loss: 5.392955E+00 | grad norm: 1.061 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 164.171 | TFLOPs: 36.70 | +63: iteration 320/ 5494 | consumed samples: 327680 | consumed tokens: 671088640 | elapsed time per iteration (s): 5.80 | learning rate: 1.989E-04 | global batch size: 1024 | lm loss: 5.315658E+00 | grad norm: 0.904 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.669 | TFLOPs: 39.50 | +63: iteration 330/ 5494 | consumed samples: 337920 | consumed tokens: 692060160 | elapsed time per iteration (s): 5.95 | learning rate: 1.989E-04 | global batch size: 1024 | lm loss: 5.250677E+00 | grad norm: 0.887 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.003 | TFLOPs: 38.45 | +63: iteration 340/ 5494 | consumed samples: 348160 | consumed tokens: 713031680 | elapsed time per iteration (s): 6.08 | learning rate: 1.988E-04 | global batch size: 1024 | lm loss: 5.219346E+00 | grad norm: 1.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.398 | TFLOPs: 37.65 | +63: iteration 350/ 5494 | consumed samples: 358400 | consumed tokens: 734003200 | elapsed time per iteration (s): 5.77 | learning rate: 1.987E-04 | global batch size: 1024 | lm loss: 5.201095E+00 | grad norm: 0.910 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.322 | TFLOPs: 39.64 | +63: iteration 360/ 5494 | consumed samples: 368640 | consumed tokens: 754974720 | elapsed time per iteration (s): 6.09 | learning rate: 1.986E-04 | global batch size: 1024 | lm loss: 5.142915E+00 | grad norm: 0.904 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.082 | TFLOPs: 37.58 | +63: iteration 370/ 5494 | consumed samples: 378880 | consumed tokens: 775946240 | elapsed time per iteration (s): 6.11 | learning rate: 1.985E-04 | global batch size: 1024 | lm loss: 5.099271E+00 | grad norm: 1.083 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 167.716 | TFLOPs: 37.50 | +63: iteration 380/ 5494 | consumed samples: 389120 | consumed tokens: 796917760 | elapsed time per iteration (s): 6.21 | learning rate: 1.984E-04 | global batch size: 1024 | lm loss: 5.058996E+00 | grad norm: 1.014 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 164.866 | TFLOPs: 36.86 | +63: iteration 390/ 5494 | consumed samples: 399360 | consumed tokens: 817889280 | elapsed time per iteration (s): 6.05 | learning rate: 1.983E-04 | global batch size: 1024 | lm loss: 4.998819E+00 | grad norm: 0.801 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 169.351 | TFLOPs: 37.86 | +63: iteration 400/ 5494 | consumed samples: 409600 | consumed tokens: 838860800 | elapsed time per iteration (s): 6.07 | learning rate: 1.982E-04 | global batch size: 1024 | lm loss: 4.957910E+00 | grad norm: 0.912 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.735 | TFLOPs: 37.72 | +63: iteration 410/ 5494 | consumed samples: 419840 | consumed tokens: 859832320 | elapsed time per iteration (s): 6.17 | learning rate: 1.981E-04 | global batch size: 1024 | lm loss: 4.906779E+00 | grad norm: 0.858 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 166.054 | TFLOPs: 37.12 | +63: iteration 420/ 5494 | consumed samples: 430080 | consumed tokens: 880803840 | elapsed time per iteration (s): 5.64 | learning rate: 1.980E-04 | global batch size: 1024 | lm loss: 4.880185E+00 | grad norm: 0.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.408 | TFLOPs: 40.56 | +63: iteration 430/ 5494 | consumed samples: 440320 | consumed tokens: 901775360 | elapsed time per iteration (s): 6.07 | learning rate: 1.979E-04 | global batch size: 1024 | lm loss: 4.827548E+00 | grad norm: 0.767 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.593 | TFLOPs: 37.69 | +63: iteration 440/ 5494 | consumed samples: 450560 | consumed tokens: 922746880 | elapsed time per iteration (s): 5.51 | learning rate: 1.978E-04 | global batch size: 1024 | lm loss: 4.810951E+00 | grad norm: 0.634 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.834 | TFLOPs: 41.55 | +63: iteration 450/ 5494 | consumed samples: 460800 | consumed tokens: 943718400 | elapsed time per iteration (s): 5.66 | learning rate: 1.977E-04 | global batch size: 1024 | lm loss: 4.748519E+00 | grad norm: 0.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.776 | TFLOPs: 40.42 | +63: iteration 460/ 5494 | consumed samples: 471040 | consumed tokens: 964689920 | elapsed time per iteration (s): 5.92 | learning rate: 1.975E-04 | global batch size: 1024 | lm loss: 4.706157E+00 | grad norm: 0.662 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.864 | TFLOPs: 38.65 | +63: iteration 470/ 5494 | consumed samples: 481280 | consumed tokens: 985661440 | elapsed time per iteration (s): 5.77 | learning rate: 1.974E-04 | global batch size: 1024 | lm loss: 4.636145E+00 | grad norm: 0.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.529 | TFLOPs: 39.69 | +63: iteration 480/ 5494 | consumed samples: 491520 | consumed tokens: 1006632960 | elapsed time per iteration (s): 6.09 | learning rate: 1.973E-04 | global batch size: 1024 | lm loss: 4.586098E+00 | grad norm: 0.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.051 | TFLOPs: 37.57 | +63: iteration 490/ 5494 | consumed samples: 501760 | consumed tokens: 1027604480 | elapsed time per iteration (s): 5.50 | learning rate: 1.972E-04 | global batch size: 1024 | lm loss: 4.543642E+00 | grad norm: 0.794 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.087 | TFLOPs: 41.60 | +63: iteration 500/ 5494 | consumed samples: 512000 | consumed tokens: 1048576000 | elapsed time per iteration (s): 5.78 | learning rate: 1.970E-04 | global batch size: 1024 | lm loss: 4.562355E+00 | grad norm: 0.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.014 | TFLOPs: 39.57 | +63: iteration 510/ 5494 | consumed samples: 522240 | consumed tokens: 1069547520 | elapsed time per iteration (s): 5.93 | learning rate: 1.969E-04 | global batch size: 1024 | lm loss: 4.480151E+00 | grad norm: 0.985 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.792 | TFLOPs: 38.63 | +63: iteration 520/ 5494 | consumed samples: 532480 | consumed tokens: 1090519040 | elapsed time per iteration (s): 5.52 | learning rate: 1.968E-04 | global batch size: 1024 | lm loss: 4.421968E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.630 | TFLOPs: 41.50 | +63: iteration 530/ 5494 | consumed samples: 542720 | consumed tokens: 1111490560 | elapsed time per iteration (s): 5.73 | learning rate: 1.966E-04 | global batch size: 1024 | lm loss: 4.347599E+00 | grad norm: 0.721 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.752 | TFLOPs: 39.96 | +63: iteration 540/ 5494 | consumed samples: 552960 | consumed tokens: 1132462080 | elapsed time per iteration (s): 6.07 | learning rate: 1.965E-04 | global batch size: 1024 | lm loss: 4.303631E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.697 | TFLOPs: 37.72 | +63: iteration 550/ 5494 | consumed samples: 563200 | consumed tokens: 1153433600 | elapsed time per iteration (s): 5.95 | learning rate: 1.963E-04 | global batch size: 1024 | lm loss: 4.276353E+00 | grad norm: 1.006 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.965 | TFLOPs: 38.45 | +63: iteration 560/ 5494 | consumed samples: 573440 | consumed tokens: 1174405120 | elapsed time per iteration (s): 5.64 | learning rate: 1.962E-04 | global batch size: 1024 | lm loss: 4.223422E+00 | grad norm: 0.622 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.705 | TFLOPs: 40.62 | +63: iteration 570/ 5494 | consumed samples: 583680 | consumed tokens: 1195376640 | elapsed time per iteration (s): 5.84 | learning rate: 1.960E-04 | global batch size: 1024 | lm loss: 4.134084E+00 | grad norm: 0.760 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.288 | TFLOPs: 39.19 | +63: iteration 580/ 5494 | consumed samples: 593920 | consumed tokens: 1216348160 | elapsed time per iteration (s): 6.21 | learning rate: 1.959E-04 | global batch size: 1024 | lm loss: 4.107196E+00 | grad norm: 0.760 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 165.006 | TFLOPs: 36.89 | +63: iteration 590/ 5494 | consumed samples: 604160 | consumed tokens: 1237319680 | elapsed time per iteration (s): 5.64 | learning rate: 1.957E-04 | global batch size: 1024 | lm loss: 4.009335E+00 | grad norm: 0.588 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.438 | TFLOPs: 40.56 | +63: iteration 600/ 5494 | consumed samples: 614400 | consumed tokens: 1258291200 | elapsed time per iteration (s): 5.93 | learning rate: 1.956E-04 | global batch size: 1024 | lm loss: 4.005797E+00 | grad norm: 0.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.740 | TFLOPs: 38.62 | +63: iteration 610/ 5494 | consumed samples: 624640 | consumed tokens: 1279262720 | elapsed time per iteration (s): 6.11 | learning rate: 1.954E-04 | global batch size: 1024 | lm loss: 3.922235E+00 | grad norm: 0.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 167.497 | TFLOPs: 37.45 | +63: iteration 620/ 5494 | consumed samples: 634880 | consumed tokens: 1300234240 | elapsed time per iteration (s): 5.45 | learning rate: 1.952E-04 | global batch size: 1024 | lm loss: 3.860219E+00 | grad norm: 0.639 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 187.884 | TFLOPs: 42.00 | +63: iteration 630/ 5494 | consumed samples: 645120 | consumed tokens: 1321205760 | elapsed time per iteration (s): 5.85 | learning rate: 1.951E-04 | global batch size: 1024 | lm loss: 3.799514E+00 | grad norm: 0.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.966 | TFLOPs: 39.12 | +63: iteration 640/ 5494 | consumed samples: 655360 | consumed tokens: 1342177280 | elapsed time per iteration (s): 5.79 | learning rate: 1.949E-04 | global batch size: 1024 | lm loss: 3.737986E+00 | grad norm: 0.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.931 | TFLOPs: 39.56 | +63: iteration 650/ 5494 | consumed samples: 665600 | consumed tokens: 1363148800 | elapsed time per iteration (s): 6.01 | learning rate: 1.947E-04 | global batch size: 1024 | lm loss: 3.701212E+00 | grad norm: 0.919 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 170.497 | TFLOPs: 38.12 | +63: iteration 660/ 5494 | consumed samples: 675840 | consumed tokens: 1384120320 | elapsed time per iteration (s): 5.88 | learning rate: 1.946E-04 | global batch size: 1024 | lm loss: 3.710569E+00 | grad norm: 0.617 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.115 | TFLOPs: 38.93 | +63: iteration 670/ 5494 | consumed samples: 686080 | consumed tokens: 1405091840 | elapsed time per iteration (s): 6.27 | learning rate: 1.944E-04 | global batch size: 1024 | lm loss: 3.604822E+00 | grad norm: 0.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 163.350 | TFLOPs: 36.52 | +63: iteration 680/ 5494 | consumed samples: 696320 | consumed tokens: 1426063360 | elapsed time per iteration (s): 5.81 | learning rate: 1.942E-04 | global batch size: 1024 | lm loss: 3.569073E+00 | grad norm: 0.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.291 | TFLOPs: 39.41 | +63: iteration 690/ 5494 | consumed samples: 706560 | consumed tokens: 1447034880 | elapsed time per iteration (s): 6.14 | learning rate: 1.940E-04 | global batch size: 1024 | lm loss: 3.506874E+00 | grad norm: 0.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 166.733 | TFLOPs: 37.28 | +63: iteration 700/ 5494 | consumed samples: 716800 | consumed tokens: 1468006400 | elapsed time per iteration (s): 6.21 | learning rate: 1.938E-04 | global batch size: 1024 | lm loss: 3.525362E+00 | grad norm: 0.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 164.961 | TFLOPs: 36.88 | +63: iteration 710/ 5494 | consumed samples: 727040 | consumed tokens: 1488977920 | elapsed time per iteration (s): 5.67 | learning rate: 1.936E-04 | global batch size: 1024 | lm loss: 3.479059E+00 | grad norm: 0.573 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.641 | TFLOPs: 40.39 | +63: iteration 720/ 5494 | consumed samples: 737280 | consumed tokens: 1509949440 | elapsed time per iteration (s): 5.38 | learning rate: 1.934E-04 | global batch size: 1024 | lm loss: 3.451244E+00 | grad norm: 0.613 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.170 | TFLOPs: 42.52 | +63: iteration 730/ 5494 | consumed samples: 747520 | consumed tokens: 1530920960 | elapsed time per iteration (s): 5.82 | learning rate: 1.932E-04 | global batch size: 1024 | lm loss: 3.402049E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.844 | TFLOPs: 39.31 | +63: iteration 740/ 5494 | consumed samples: 757760 | consumed tokens: 1551892480 | elapsed time per iteration (s): 5.86 | learning rate: 1.930E-04 | global batch size: 1024 | lm loss: 3.362186E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.662 | TFLOPs: 39.05 | +63: iteration 750/ 5494 | consumed samples: 768000 | consumed tokens: 1572864000 | elapsed time per iteration (s): 5.77 | learning rate: 1.928E-04 | global batch size: 1024 | lm loss: 3.364851E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.446 | TFLOPs: 39.67 | +63: iteration 760/ 5494 | consumed samples: 778240 | consumed tokens: 1593835520 | elapsed time per iteration (s): 6.07 | learning rate: 1.926E-04 | global batch size: 1024 | lm loss: 3.299728E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.645 | TFLOPs: 37.70 | +63: iteration 770/ 5494 | consumed samples: 788480 | consumed tokens: 1614807040 | elapsed time per iteration (s): 5.67 | learning rate: 1.924E-04 | global batch size: 1024 | lm loss: 3.337376E+00 | grad norm: 0.643 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.454 | TFLOPs: 40.34 | +63: iteration 780/ 5494 | consumed samples: 798720 | consumed tokens: 1635778560 | elapsed time per iteration (s): 6.13 | learning rate: 1.922E-04 | global batch size: 1024 | lm loss: 3.306590E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 167.038 | TFLOPs: 37.34 | +63: iteration 790/ 5494 | consumed samples: 808960 | consumed tokens: 1656750080 | elapsed time per iteration (s): 5.79 | learning rate: 1.920E-04 | global batch size: 1024 | lm loss: 3.250806E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.828 | TFLOPs: 39.53 | +63: iteration 800/ 5494 | consumed samples: 819200 | consumed tokens: 1677721600 | elapsed time per iteration (s): 5.52 | learning rate: 1.918E-04 | global batch size: 1024 | lm loss: 3.208947E+00 | grad norm: 0.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.390 | TFLOPs: 41.45 | +63: iteration 810/ 5494 | consumed samples: 829440 | consumed tokens: 1698693120 | elapsed time per iteration (s): 5.95 | learning rate: 1.916E-04 | global batch size: 1024 | lm loss: 3.216829E+00 | grad norm: 0.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.984 | TFLOPs: 38.45 | +63: iteration 820/ 5494 | consumed samples: 839680 | consumed tokens: 1719664640 | elapsed time per iteration (s): 6.23 | learning rate: 1.914E-04 | global batch size: 1024 | lm loss: 3.208474E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 164.487 | TFLOPs: 36.77 | +63: iteration 830/ 5494 | consumed samples: 849920 | consumed tokens: 1740636160 | elapsed time per iteration (s): 5.79 | learning rate: 1.911E-04 | global batch size: 1024 | lm loss: 3.190247E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.817 | TFLOPs: 39.53 | +63: iteration 840/ 5494 | consumed samples: 860160 | consumed tokens: 1761607680 | elapsed time per iteration (s): 5.83 | learning rate: 1.909E-04 | global batch size: 1024 | lm loss: 3.177559E+00 | grad norm: 0.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.694 | TFLOPs: 39.28 | +63: iteration 850/ 5494 | consumed samples: 870400 | consumed tokens: 1782579200 | elapsed time per iteration (s): 5.78 | learning rate: 1.907E-04 | global batch size: 1024 | lm loss: 3.152160E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.164 | TFLOPs: 39.61 | +63: iteration 860/ 5494 | consumed samples: 880640 | consumed tokens: 1803550720 | elapsed time per iteration (s): 5.70 | learning rate: 1.904E-04 | global batch size: 1024 | lm loss: 3.119644E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.616 | TFLOPs: 40.16 | +63: iteration 870/ 5494 | consumed samples: 890880 | consumed tokens: 1824522240 | elapsed time per iteration (s): 5.95 | learning rate: 1.902E-04 | global batch size: 1024 | lm loss: 3.117018E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.189 | TFLOPs: 38.50 | +63: iteration 880/ 5494 | consumed samples: 901120 | consumed tokens: 1845493760 | elapsed time per iteration (s): 5.65 | learning rate: 1.900E-04 | global batch size: 1024 | lm loss: 3.085452E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.383 | TFLOPs: 40.55 | +63: iteration 890/ 5494 | consumed samples: 911360 | consumed tokens: 1866465280 | elapsed time per iteration (s): 5.56 | learning rate: 1.897E-04 | global batch size: 1024 | lm loss: 3.084528E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.297 | TFLOPs: 41.20 | +63: iteration 900/ 5494 | consumed samples: 921600 | consumed tokens: 1887436800 | elapsed time per iteration (s): 5.53 | learning rate: 1.895E-04 | global batch size: 1024 | lm loss: 3.080099E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.236 | TFLOPs: 41.41 | +63: iteration 910/ 5494 | consumed samples: 931840 | consumed tokens: 1908408320 | elapsed time per iteration (s): 5.91 | learning rate: 1.892E-04 | global batch size: 1024 | lm loss: 3.068504E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.262 | TFLOPs: 38.74 | +63: iteration 920/ 5494 | consumed samples: 942080 | consumed tokens: 1929379840 | elapsed time per iteration (s): 5.91 | learning rate: 1.890E-04 | global batch size: 1024 | lm loss: 3.039456E+00 | grad norm: 0.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.297 | TFLOPs: 38.74 | +63: iteration 930/ 5494 | consumed samples: 952320 | consumed tokens: 1950351360 | elapsed time per iteration (s): 5.82 | learning rate: 1.887E-04 | global batch size: 1024 | lm loss: 3.045891E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.085 | TFLOPs: 39.37 | +63: iteration 940/ 5494 | consumed samples: 962560 | consumed tokens: 1971322880 | elapsed time per iteration (s): 5.79 | learning rate: 1.885E-04 | global batch size: 1024 | lm loss: 3.013646E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.748 | TFLOPs: 39.52 | +63: iteration 950/ 5494 | consumed samples: 972800 | consumed tokens: 1992294400 | elapsed time per iteration (s): 6.06 | learning rate: 1.882E-04 | global batch size: 1024 | lm loss: 3.009480E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.947 | TFLOPs: 37.77 | +63: iteration 960/ 5494 | consumed samples: 983040 | consumed tokens: 2013265920 | elapsed time per iteration (s): 5.66 | learning rate: 1.880E-04 | global batch size: 1024 | lm loss: 3.001099E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.042 | TFLOPs: 40.47 | +63: iteration 970/ 5494 | consumed samples: 993280 | consumed tokens: 2034237440 | elapsed time per iteration (s): 5.94 | learning rate: 1.877E-04 | global batch size: 1024 | lm loss: 2.957767E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.293 | TFLOPs: 38.52 | +63: iteration 980/ 5494 | consumed samples: 1003520 | consumed tokens: 2055208960 | elapsed time per iteration (s): 5.94 | learning rate: 1.875E-04 | global batch size: 1024 | lm loss: 3.026293E+00 | grad norm: 0.774 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.339 | TFLOPs: 38.53 | +63: iteration 990/ 5494 | consumed samples: 1013760 | consumed tokens: 2076180480 | elapsed time per iteration (s): 5.78 | learning rate: 1.872E-04 | global batch size: 1024 | lm loss: 3.006786E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.095 | TFLOPs: 39.59 | +63: iteration 1000/ 5494 | consumed samples: 1024000 | consumed tokens: 2097152000 | elapsed time per iteration (s): 5.65 | learning rate: 1.869E-04 | global batch size: 1024 | lm loss: 2.966990E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.364 | TFLOPs: 40.55 | +63: ------------------------------------------------------------------------------------------ +63: valid loss at iteration 1000 | lm loss value: 2.918611E+00 | lm loss PPL: 1.851555E+01 | +63: ------------------------------------------------------------------------------------------ + 0: saving checkpoint at iteration 1000 to checkpoints_8b7beta + 0: [2022-12-01 20:06:50,792] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is begin to save! + 0: [2022-12-01 20:06:51,938] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_01-model_01-model_states.pt... + 0: [2022-12-01 20:06:51,938] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_01-model_00-model_states.pt... +32: [2022-12-01 20:06:51,956] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_24-model_01-model_states.pt... +32: [2022-12-01 20:06:51,956] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_24-model_00-model_states.pt... +32: [2022-12-01 20:06:52,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_24-model_01-model_states.pt. +32: [2022-12-01 20:06:52,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_24-model_00-model_states.pt. + 0: [2022-12-01 20:06:52,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_01-model_00-model_states.pt. + 0: [2022-12-01 20:06:52,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_01-model_01-model_states.pt. +32: [2022-12-01 20:06:52,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_25-model_01-model_states.pt... +32: [2022-12-01 20:06:52,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_25-model_00-model_states.pt... + 0: [2022-12-01 20:06:52,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_03-model_00-model_states.pt... + 0: [2022-12-01 20:06:52,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_03-model_01-model_states.pt... + 0: [2022-12-01 20:06:52,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_03-model_00-model_states.pt. + 0: [2022-12-01 20:06:52,728] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_04-model_00-model_states.pt... +32: [2022-12-01 20:06:52,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_25-model_01-model_states.pt. +32: [2022-12-01 20:06:52,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_26-model_01-model_states.pt... +32: [2022-12-01 20:06:52,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_25-model_00-model_states.pt. + 0: [2022-12-01 20:06:52,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_03-model_01-model_states.pt. +32: [2022-12-01 20:06:52,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_26-model_00-model_states.pt... + 0: [2022-12-01 20:06:52,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_04-model_01-model_states.pt... +32: [2022-12-01 20:06:52,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_26-model_01-model_states.pt. +32: [2022-12-01 20:06:52,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_27-model_01-model_states.pt... + 0: [2022-12-01 20:06:52,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_04-model_01-model_states.pt. + 0: [2022-12-01 20:06:52,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_04-model_00-model_states.pt. + 0: [2022-12-01 20:06:52,999] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_05-model_01-model_states.pt... + 0: [2022-12-01 20:06:52,999] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_05-model_00-model_states.pt... +32: [2022-12-01 20:06:53,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_26-model_00-model_states.pt. +32: [2022-12-01 20:06:53,006] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_27-model_00-model_states.pt... +32: [2022-12-01 20:06:53,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_27-model_01-model_states.pt. +32: [2022-12-01 20:06:53,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_28-model_01-model_states.pt... + 0: [2022-12-01 20:06:53,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_05-model_01-model_states.pt. + 0: [2022-12-01 20:06:53,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_05-model_00-model_states.pt. + 0: [2022-12-01 20:06:53,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_06-model_01-model_states.pt... + 0: [2022-12-01 20:06:53,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_06-model_00-model_states.pt... +32: [2022-12-01 20:06:53,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_27-model_00-model_states.pt. +32: [2022-12-01 20:06:53,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_28-model_00-model_states.pt... +32: [2022-12-01 20:06:53,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_28-model_00-model_states.pt. +32: [2022-12-01 20:06:53,485] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_29-model_00-model_states.pt... + 0: [2022-12-01 20:06:53,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_06-model_00-model_states.pt. + 0: [2022-12-01 20:06:53,498] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_07-model_00-model_states.pt... + 0: [2022-12-01 20:06:53,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_06-model_01-model_states.pt. + 0: [2022-12-01 20:06:53,503] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_07-model_01-model_states.pt... +32: [2022-12-01 20:06:53,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_28-model_01-model_states.pt. +32: [2022-12-01 20:06:53,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_29-model_01-model_states.pt... +32: [2022-12-01 20:06:53,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_29-model_00-model_states.pt. +32: [2022-12-01 20:06:53,716] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_30-model_00-model_states.pt... + 0: [2022-12-01 20:06:53,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_07-model_01-model_states.pt. + 0: [2022-12-01 20:06:53,743] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_08-model_01-model_states.pt... +32: [2022-12-01 20:06:53,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_29-model_01-model_states.pt. +32: [2022-12-01 20:06:53,748] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_30-model_01-model_states.pt... + 0: [2022-12-01 20:06:53,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_07-model_00-model_states.pt. + 0: [2022-12-01 20:06:53,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_08-model_00-model_states.pt... +32: [2022-12-01 20:06:53,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_30-model_00-model_states.pt. +32: [2022-12-01 20:06:53,941] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_31-model_00-model_states.pt... +32: [2022-12-01 20:06:53,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_30-model_01-model_states.pt. +32: [2022-12-01 20:06:53,979] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_31-model_01-model_states.pt... + 0: [2022-12-01 20:06:54,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_08-model_00-model_states.pt. + 0: [2022-12-01 20:06:54,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_08-model_01-model_states.pt. + 0: [2022-12-01 20:06:54,001] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_09-model_00-model_states.pt... + 0: [2022-12-01 20:06:54,001] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_09-model_01-model_states.pt... +32: [2022-12-01 20:06:54,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_31-model_00-model_states.pt. +32: [2022-12-01 20:06:54,174] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_32-model_00-model_states.pt... +32: [2022-12-01 20:06:54,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_31-model_01-model_states.pt. +32: [2022-12-01 20:06:54,202] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_32-model_01-model_states.pt... + 0: [2022-12-01 20:06:54,259] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_09-model_00-model_states.pt. + 0: [2022-12-01 20:06:54,259] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_10-model_00-model_states.pt... + 0: [2022-12-01 20:06:54,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_09-model_01-model_states.pt. + 0: [2022-12-01 20:06:54,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_10-model_01-model_states.pt... +32: [2022-12-01 20:06:54,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_32-model_00-model_states.pt. +32: [2022-12-01 20:06:54,411] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_33-model_00-model_states.pt... +32: [2022-12-01 20:06:54,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_32-model_01-model_states.pt. +32: [2022-12-01 20:06:54,433] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_33-model_01-model_states.pt... + 0: [2022-12-01 20:06:54,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_10-model_01-model_states.pt. + 0: [2022-12-01 20:06:54,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_11-model_01-model_states.pt... + 0: [2022-12-01 20:06:54,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_10-model_00-model_states.pt. + 0: [2022-12-01 20:06:54,541] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_11-model_00-model_states.pt... +32: [2022-12-01 20:06:54,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_33-model_00-model_states.pt. +32: [2022-12-01 20:06:54,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_34-model_00-model_states.pt... +32: [2022-12-01 20:06:54,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_33-model_01-model_states.pt. +32: [2022-12-01 20:06:54,670] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_34-model_01-model_states.pt... + 0: [2022-12-01 20:06:54,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_11-model_01-model_states.pt. + 0: [2022-12-01 20:06:54,779] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_12-model_01-model_states.pt... + 0: [2022-12-01 20:06:54,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_11-model_00-model_states.pt. + 0: [2022-12-01 20:06:54,799] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_12-model_00-model_states.pt... +32: [2022-12-01 20:06:54,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_34-model_00-model_states.pt. +32: [2022-12-01 20:06:54,867] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_35-model_00-model_states.pt... +32: [2022-12-01 20:06:54,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_34-model_01-model_states.pt. +32: [2022-12-01 20:06:54,896] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_35-model_01-model_states.pt... + 0: [2022-12-01 20:06:55,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_12-model_01-model_states.pt. + 0: [2022-12-01 20:06:55,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_13-model_01-model_states.pt... + 0: [2022-12-01 20:06:55,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_12-model_00-model_states.pt. + 0: [2022-12-01 20:06:55,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_13-model_00-model_states.pt... +32: [2022-12-01 20:06:55,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_35-model_00-model_states.pt. +32: [2022-12-01 20:06:55,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_36-model_00-model_states.pt... +32: [2022-12-01 20:06:55,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_35-model_01-model_states.pt. +32: [2022-12-01 20:06:55,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_36-model_01-model_states.pt... + 0: [2022-12-01 20:06:55,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_13-model_01-model_states.pt. + 0: [2022-12-01 20:06:55,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_14-model_01-model_states.pt... +32: [2022-12-01 20:06:55,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_36-model_00-model_states.pt. +32: [2022-12-01 20:06:55,317] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_37-model_00-model_states.pt... + 0: [2022-12-01 20:06:55,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_13-model_00-model_states.pt. + 0: [2022-12-01 20:06:55,326] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_14-model_00-model_states.pt... +32: [2022-12-01 20:06:55,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_36-model_01-model_states.pt. +32: [2022-12-01 20:06:55,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_37-model_01-model_states.pt... +32: [2022-12-01 20:06:55,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_37-model_00-model_states.pt. +32: [2022-12-01 20:06:55,536] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_38-model_00-model_states.pt... +32: [2022-12-01 20:06:55,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_37-model_01-model_states.pt. +32: [2022-12-01 20:06:55,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_38-model_01-model_states.pt... + 0: [2022-12-01 20:06:55,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_14-model_01-model_states.pt. + 0: [2022-12-01 20:06:55,565] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_15-model_01-model_states.pt... + 0: [2022-12-01 20:06:55,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_14-model_00-model_states.pt. + 0: [2022-12-01 20:06:55,573] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_15-model_00-model_states.pt... +32: [2022-12-01 20:06:55,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_38-model_00-model_states.pt. +32: [2022-12-01 20:06:55,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_39-model_00-model_states.pt... +32: [2022-12-01 20:06:55,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_38-model_01-model_states.pt. +32: [2022-12-01 20:06:55,803] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_39-model_01-model_states.pt... + 0: [2022-12-01 20:06:55,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_15-model_01-model_states.pt. + 0: [2022-12-01 20:06:55,834] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_16-model_01-model_states.pt... + 0: [2022-12-01 20:06:55,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_15-model_00-model_states.pt. + 0: [2022-12-01 20:06:55,842] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_16-model_00-model_states.pt... +32: [2022-12-01 20:06:55,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_39-model_00-model_states.pt. +32: [2022-12-01 20:06:55,980] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_40-model_00-model_states.pt... +32: [2022-12-01 20:06:56,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_39-model_01-model_states.pt. +32: [2022-12-01 20:06:56,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_40-model_01-model_states.pt... + 0: [2022-12-01 20:06:56,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_16-model_00-model_states.pt. + 0: [2022-12-01 20:06:56,079] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_17-model_00-model_states.pt... + 0: [2022-12-01 20:06:56,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_16-model_01-model_states.pt. + 0: [2022-12-01 20:06:56,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_17-model_01-model_states.pt... +32: [2022-12-01 20:06:56,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_40-model_00-model_states.pt. +32: [2022-12-01 20:06:56,215] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_41-model_00-model_states.pt... +32: [2022-12-01 20:06:56,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_40-model_01-model_states.pt. +32: [2022-12-01 20:06:56,243] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_41-model_01-model_states.pt... + 0: [2022-12-01 20:06:56,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_17-model_00-model_states.pt. + 0: [2022-12-01 20:06:56,341] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_18-model_00-model_states.pt... + 0: [2022-12-01 20:06:56,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_17-model_01-model_states.pt. + 0: [2022-12-01 20:06:56,348] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_18-model_01-model_states.pt... +32: [2022-12-01 20:06:56,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_41-model_00-model_states.pt. +32: [2022-12-01 20:06:56,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_42-model_00-model_states.pt... +32: [2022-12-01 20:06:56,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_41-model_01-model_states.pt. +32: [2022-12-01 20:06:56,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_42-model_01-model_states.pt... + 0: [2022-12-01 20:06:56,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_18-model_00-model_states.pt. + 0: [2022-12-01 20:06:56,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_18-model_01-model_states.pt. + 0: [2022-12-01 20:06:56,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_19-model_00-model_states.pt... + 0: [2022-12-01 20:06:56,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_19-model_01-model_states.pt... +32: [2022-12-01 20:06:56,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_42-model_00-model_states.pt. +32: [2022-12-01 20:06:56,672] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_43-model_00-model_states.pt... +32: [2022-12-01 20:06:56,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_42-model_01-model_states.pt. +32: [2022-12-01 20:06:56,693] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_43-model_01-model_states.pt... + 0: [2022-12-01 20:06:56,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_19-model_00-model_states.pt. + 0: [2022-12-01 20:06:56,869] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_20-model_00-model_states.pt... + 0: [2022-12-01 20:06:56,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_19-model_01-model_states.pt. + 0: [2022-12-01 20:06:56,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_20-model_01-model_states.pt... +32: [2022-12-01 20:06:56,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_43-model_00-model_states.pt. +32: [2022-12-01 20:06:56,890] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_44-model_00-model_states.pt... +32: [2022-12-01 20:06:56,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_43-model_01-model_states.pt. +32: [2022-12-01 20:06:56,927] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_44-model_01-model_states.pt... +32: [2022-12-01 20:06:57,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_44-model_00-model_states.pt. +32: [2022-12-01 20:06:57,112] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_46-model_00-model_states.pt... +32: [2022-12-01 20:06:57,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_46-model_00-model_states.pt. +32: [2022-12-01 20:06:57,115] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/mp_rank_02_model_states.pt... + 0: [2022-12-01 20:06:57,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_20-model_01-model_states.pt. + 0: [2022-12-01 20:06:57,126] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_21-model_01-model_states.pt... + 0: [2022-12-01 20:06:57,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_20-model_00-model_states.pt. + 0: [2022-12-01 20:06:57,144] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_21-model_00-model_states.pt... +32: [2022-12-01 20:06:57,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_44-model_01-model_states.pt. +32: [2022-12-01 20:06:57,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_46-model_01-model_states.pt... +32: [2022-12-01 20:06:57,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_46-model_01-model_states.pt. +32: [2022-12-01 20:06:57,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/mp_rank_02_model_states.pt. +32: [2022-12-01 20:06:57,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/mp_rank_03_model_states.pt... +32: [2022-12-01 20:06:57,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/mp_rank_03_model_states.pt. + 0: [2022-12-01 20:06:57,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_21-model_00-model_states.pt. + 0: [2022-12-01 20:06:57,384] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_22-model_00-model_states.pt... + 0: [2022-12-01 20:06:57,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_21-model_01-model_states.pt. + 0: [2022-12-01 20:06:57,387] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_22-model_01-model_states.pt... + 0: [2022-12-01 20:06:57,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_22-model_01-model_states.pt. + 0: [2022-12-01 20:06:57,631] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_23-model_01-model_states.pt... + 0: [2022-12-01 20:06:57,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_22-model_00-model_states.pt. + 0: [2022-12-01 20:06:57,636] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/layer_23-model_00-model_states.pt... + 0: [2022-12-01 20:06:57,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_23-model_01-model_states.pt. + 0: [2022-12-01 20:06:57,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/layer_23-model_00-model_states.pt. + 0: [2022-12-01 20:06:57,876] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: checkpoints_8b7beta/global_step1000/mp_rank_01_model_states.pt + 0: [2022-12-01 20:06:57,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/mp_rank_01_model_states.pt... + 0: [2022-12-01 20:06:57,877] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_8b7beta/global_step1000/mp_rank_00_model_states.pt + 0: [2022-12-01 20:06:57,877] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/mp_rank_00_model_states.pt... + 0: [2022-12-01 20:06:57,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/mp_rank_00_model_states.pt. + 0: [2022-12-01 20:06:57,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/mp_rank_01_model_states.pt. + 0: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... +32: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt... + 0: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +32: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt... + 0: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... + 0: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... +54: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt... +54: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt... +54: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt... +54: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt... +56: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt... +56: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt... +56: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt... +56: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt... +62: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt... +62: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt... +62: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt... +60: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt... +60: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt... +60: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt... +60: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt... +51: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt... +51: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt... +51: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt... +51: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt... +63: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt... +63: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt... +63: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt... +63: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt... +39: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt... +39: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt... +39: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt... +57: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt... +57: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt... +57: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt... +57: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt... +43: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt... +43: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt... +38: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt... +38: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt... +38: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt... +38: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt... +36: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt... +36: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt... +42: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt... +42: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt... +42: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt... +42: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt... +58: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt... +58: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt... +58: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt... +58: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt... +44: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt... +44: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt... +44: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt... +34: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt... +34: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt... +34: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt... +34: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt... +32: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt... +50: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt... +50: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt... +46: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt... +46: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt... +46: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt... +26: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... +26: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... +49: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt... +49: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt... +49: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt... +49: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt... +41: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt... +41: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt... +41: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt... +55: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt... +55: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt... +55: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt... +55: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt... +37: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt... + 8: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... + 8: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... + 8: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... + 8: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... +33: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt... +33: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt... +33: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt... +33: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt... +52: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt... +52: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt... +52: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt... +61: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt... +61: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt... +61: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt... +61: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt... +45: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt... +45: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt... +45: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt... +45: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt... +47: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt... +47: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt... +47: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt... +47: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt... +59: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt... +59: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt... +59: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt... +59: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt... +12: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... +12: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... +12: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... +12: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... +29: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... +29: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... +29: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... +35: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt... +35: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt... +35: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt... +35: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt... +48: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt... +48: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt... +48: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt... +48: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt... +53: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt... +53: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt... +53: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt... +53: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt... +40: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt... +40: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt... +40: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt... +40: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt... +16: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... +16: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... + 0: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... +54: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt... +54: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt... +54: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt... +56: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt... +56: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt... +62: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt... + 4: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... + 4: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... + 4: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... +60: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt... +60: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt... +51: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt... +63: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt... +63: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt... +63: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt... +17: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... +17: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... +17: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... +17: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... + 1: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... + 1: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... + 1: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... + 1: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... +39: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt... + 7: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... + 7: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... + 7: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... +11: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... +11: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... +11: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... +57: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt... +27: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... +27: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... +27: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... +43: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt... +25: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... +25: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... +25: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... +25: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... +13: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... +13: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... +14: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... +14: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... +38: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt... +38: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt... +38: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt... +22: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... +36: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt... +36: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt... +36: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt... +36: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt... +24: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... +24: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... +24: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... +42: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt... +42: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt... +42: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt... +58: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt... +44: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt... +44: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt... +44: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt... +34: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt... +28: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... +28: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... +32: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt... +32: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt... +50: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt... +50: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt... +46: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt... +26: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... +49: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt... +49: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt... + 5: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... + 5: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... + 5: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... + 5: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... +20: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... +20: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... +41: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt... +41: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt... +41: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt... +55: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt... +55: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt... +55: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt... +55: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt... +10: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... +10: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... +10: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... +37: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt... +37: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt... +37: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt... + 8: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +18: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... +21: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... +21: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... +21: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... +21: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... +33: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt... +33: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt... +52: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt... + 6: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... + 6: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... + 6: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... + 6: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... +31: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... +31: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... +31: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... +61: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt... +61: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt... +15: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... +15: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... +15: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... +15: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... +45: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt... +45: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt... +47: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt... +47: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt... + 3: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... + 3: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... + 9: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... + 9: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... + 9: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... +59: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt... +59: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt... +59: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt... +12: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +12: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +12: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... + 2: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... + 2: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... + 2: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... + 2: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... +29: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... +35: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt... +35: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt... +19: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... +19: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... +19: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... +19: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... +48: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt... +48: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt... +48: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt... +53: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt... +53: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt... +53: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt... +40: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt... +40: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt... +40: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt... +30: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... +30: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... +23: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... +23: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... +16: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... +16: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... + 0: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +54: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt... +56: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt... +62: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt... + 4: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... +60: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt... +51: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt... +51: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt... +51: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt... +63: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt... +17: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... +17: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... +17: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... +17: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... + 1: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... + 1: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... + 1: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... + 1: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +39: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt... +39: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt... +39: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt... + 7: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... +11: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... +57: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt... +27: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... +43: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt... +43: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt... +25: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... +13: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... +14: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... +38: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt... +22: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... +22: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... +36: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt... +24: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... +42: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt... +58: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt... +58: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt... +44: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt... +34: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt... +34: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt... +28: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... +32: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt... +50: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt... +50: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt... +50: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt... +50: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt... +46: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt... +46: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt... +46: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt... +26: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... +49: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt... +49: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt... + 5: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... + 5: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +20: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... +41: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt... +10: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... +37: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt... +37: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt... +37: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt... +37: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt... + 8: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... + 8: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +18: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... +18: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... +21: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... +21: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... +33: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt... +33: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt... +52: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt... +52: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt... + 6: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +31: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... +61: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt... +15: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +15: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +15: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +45: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt... +45: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt... +47: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt... +47: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt... + 3: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... + 9: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... + 9: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +59: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt... +12: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... + 2: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... + 2: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +29: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... +35: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt... +35: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt... +19: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... +19: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... +19: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... +48: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt... +53: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt... +40: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt... +30: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... +23: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... +23: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... +16: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... +16: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... +16: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... + 0: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +56: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt... +62: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt... +62: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt... +62: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt... + 4: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... + 4: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... + 4: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +60: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt... +39: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt... + 7: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... + 7: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +11: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +57: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt... +27: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... +27: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... +27: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... +43: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt... +25: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... +25: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... +13: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +13: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +14: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... +22: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... +22: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... +36: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt... +24: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... +58: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt... +44: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt... +34: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt... +28: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... +32: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt... +46: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt... +26: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... +26: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... + 5: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... + 5: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +20: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... +41: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt... +10: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +10: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... + 8: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +18: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... +18: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... +18: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... +21: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... +21: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... +52: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt... + 6: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... + 6: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +31: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... +31: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... +31: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... +61: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt... +15: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... + 3: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... + 9: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... + 2: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +29: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... +29: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... +19: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... +30: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... +23: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... + 0: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... + 4: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... + 7: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +11: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +11: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +57: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt... +27: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... +43: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt... +43: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt... +25: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... +13: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +13: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +14: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +14: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +22: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... +22: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... +22: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... +24: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... +24: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... +28: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... +32: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt... +26: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... +20: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... +10: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +18: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... +52: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt... + 6: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +31: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... + 3: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... + 9: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... + 9: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... + 2: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +29: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... +30: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... +30: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... +30: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... +23: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... +23: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... +16: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... + 7: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +11: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +13: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... +14: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +24: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... +28: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... +26: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... +20: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... +20: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... +10: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +18: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... + 3: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... + 3: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +23: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... +14: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +28: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... +20: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... + 3: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... +30: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... +28: [2022-12-01 20:06:58,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... + 0: [2022-12-01 20:06:58,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. + 0: [2022-12-01 20:06:58,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt + 0: [2022-12-01 20:06:58,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 0: [2022-12-01 20:06:58,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. + 0: [2022-12-01 20:06:58,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. + 0: [2022-12-01 20:06:58,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt + 0: [2022-12-01 20:06:58,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 0: [2022-12-01 20:06:58,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. + 0: [2022-12-01 20:06:58,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. + 0: [2022-12-01 20:06:58,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt + 0: [2022-12-01 20:06:58,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt + 0: [2022-12-01 20:06:58,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 0: [2022-12-01 20:06:58,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +32: [2022-12-01 20:06:58,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt. +32: [2022-12-01 20:06:58,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt +32: [2022-12-01 20:06:58,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +32: [2022-12-01 20:06:58,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt. +32: [2022-12-01 20:06:58,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt +32: [2022-12-01 20:06:58,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 0: [2022-12-01 20:06:58,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. + 0: [2022-12-01 20:06:58,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. + 0: [2022-12-01 20:06:58,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. + 0: [2022-12-01 20:06:58,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt + 0: [2022-12-01 20:06:58,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt + 0: [2022-12-01 20:06:58,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt + 0: [2022-12-01 20:06:58,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 0: [2022-12-01 20:06:58,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 0: [2022-12-01 20:06:58,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +32: [2022-12-01 20:06:58,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt. +32: [2022-12-01 20:06:58,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt. +32: [2022-12-01 20:06:58,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt +32: [2022-12-01 20:06:58,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt +32: [2022-12-01 20:06:58,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +32: [2022-12-01 20:06:58,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +32: [2022-12-01 20:06:58,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt. +32: [2022-12-01 20:06:58,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt +32: [2022-12-01 20:06:58,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +32: [2022-12-01 20:06:58,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt. +32: [2022-12-01 20:06:58,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt +32: [2022-12-01 20:06:58,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt. +32: [2022-12-01 20:06:58,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +32: [2022-12-01 20:06:58,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt +32: [2022-12-01 20:06:58,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +33: [2022-12-01 20:06:58,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt. +33: [2022-12-01 20:06:58,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt. +33: [2022-12-01 20:06:58,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt. +33: [2022-12-01 20:06:58,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt. +33: [2022-12-01 20:06:58,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt +33: [2022-12-01 20:06:58,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt +33: [2022-12-01 20:06:58,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt +33: [2022-12-01 20:06:58,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +33: [2022-12-01 20:06:58,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +33: [2022-12-01 20:06:58,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt +33: [2022-12-01 20:06:58,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +33: [2022-12-01 20:06:58,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +33: [2022-12-01 20:06:58,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt. +33: [2022-12-01 20:06:58,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt +33: [2022-12-01 20:06:58,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +33: [2022-12-01 20:06:58,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt. +33: [2022-12-01 20:06:58,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt +33: [2022-12-01 20:06:58,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +33: [2022-12-01 20:06:58,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt. +33: [2022-12-01 20:06:58,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt +33: [2022-12-01 20:06:58,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 7: [2022-12-01 20:06:58,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. + 7: [2022-12-01 20:06:58,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. + 7: [2022-12-01 20:06:58,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. + 7: [2022-12-01 20:06:58,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. + 7: [2022-12-01 20:06:58,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt + 7: [2022-12-01 20:06:58,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt + 7: [2022-12-01 20:06:58,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt + 7: [2022-12-01 20:06:58,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt + 7: [2022-12-01 20:06:58,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 7: [2022-12-01 20:06:58,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 7: [2022-12-01 20:06:58,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 7: [2022-12-01 20:06:58,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +33: [2022-12-01 20:06:58,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt. +33: [2022-12-01 20:06:58,547] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt +33: [2022-12-01 20:06:58,547] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +44: [2022-12-01 20:06:58,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt. +44: [2022-12-01 20:06:58,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt +44: [2022-12-01 20:06:58,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +44: [2022-12-01 20:06:58,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt. +44: [2022-12-01 20:06:58,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt +44: [2022-12-01 20:06:58,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +32: [2022-12-01 20:06:58,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt. +32: [2022-12-01 20:06:58,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt +32: [2022-12-01 20:06:58,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 7: [2022-12-01 20:06:58,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. + 7: [2022-12-01 20:06:58,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt + 7: [2022-12-01 20:06:58,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 7: [2022-12-01 20:06:58,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. + 7: [2022-12-01 20:06:58,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt + 7: [2022-12-01 20:06:58,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +28: [2022-12-01 20:06:58,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. +28: [2022-12-01 20:06:58,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. +28: [2022-12-01 20:06:58,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt +28: [2022-12-01 20:06:58,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt +28: [2022-12-01 20:06:58,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +28: [2022-12-01 20:06:58,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +44: [2022-12-01 20:06:58,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt. +44: [2022-12-01 20:06:58,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt +44: [2022-12-01 20:06:58,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +52: [2022-12-01 20:06:58,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt. +52: [2022-12-01 20:06:58,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt. +52: [2022-12-01 20:06:58,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt. +52: [2022-12-01 20:06:58,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt. +61: [2022-12-01 20:06:58,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt. +61: [2022-12-01 20:06:58,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt. +30: [2022-12-01 20:06:58,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. +30: [2022-12-01 20:06:58,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. +30: [2022-12-01 20:06:58,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. +30: [2022-12-01 20:06:58,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. +30: [2022-12-01 20:06:58,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. +30: [2022-12-01 20:06:58,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. +30: [2022-12-01 20:06:58,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. +30: [2022-12-01 20:06:58,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt +30: [2022-12-01 20:06:58,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt +30: [2022-12-01 20:06:58,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +30: [2022-12-01 20:06:58,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt +30: [2022-12-01 20:06:58,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt +30: [2022-12-01 20:06:58,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt +30: [2022-12-01 20:06:58,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt +30: [2022-12-01 20:06:58,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt +30: [2022-12-01 20:06:58,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +30: [2022-12-01 20:06:58,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +30: [2022-12-01 20:06:58,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +30: [2022-12-01 20:06:58,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +30: [2022-12-01 20:06:58,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +30: [2022-12-01 20:06:58,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +30: [2022-12-01 20:06:58,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. +30: [2022-12-01 20:06:58,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt +30: [2022-12-01 20:06:58,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +61: [2022-12-01 20:06:58,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt. +61: [2022-12-01 20:06:58,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt. +61: [2022-12-01 20:06:58,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt. +61: [2022-12-01 20:06:58,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt +61: [2022-12-01 20:06:58,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt +61: [2022-12-01 20:06:58,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt +61: [2022-12-01 20:06:58,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +61: [2022-12-01 20:06:58,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt +61: [2022-12-01 20:06:58,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt +61: [2022-12-01 20:06:58,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +61: [2022-12-01 20:06:58,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +61: [2022-12-01 20:06:58,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +61: [2022-12-01 20:06:58,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +52: [2022-12-01 20:06:58,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt +52: [2022-12-01 20:06:58,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt +52: [2022-12-01 20:06:58,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt +52: [2022-12-01 20:06:58,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt +52: [2022-12-01 20:06:58,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +52: [2022-12-01 20:06:58,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +52: [2022-12-01 20:06:58,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +52: [2022-12-01 20:06:58,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 1: [2022-12-01 20:06:58,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. + 1: [2022-12-01 20:06:58,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. + 1: [2022-12-01 20:06:58,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. + 1: [2022-12-01 20:06:58,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. + 1: [2022-12-01 20:06:58,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt + 1: [2022-12-01 20:06:58,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt + 1: [2022-12-01 20:06:58,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt + 1: [2022-12-01 20:06:58,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 1: [2022-12-01 20:06:58,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 1: [2022-12-01 20:06:58,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt + 1: [2022-12-01 20:06:58,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 1: [2022-12-01 20:06:58,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +61: [2022-12-01 20:06:58,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt. +55: [2022-12-01 20:06:58,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt. +55: [2022-12-01 20:06:58,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt. +55: [2022-12-01 20:06:58,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt. +55: [2022-12-01 20:06:58,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt. +55: [2022-12-01 20:06:58,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt. +55: [2022-12-01 20:06:58,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt. +55: [2022-12-01 20:06:58,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt. +55: [2022-12-01 20:06:58,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt. +55: [2022-12-01 20:06:58,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt +55: [2022-12-01 20:06:58,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt +55: [2022-12-01 20:06:58,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt +55: [2022-12-01 20:06:58,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt +55: [2022-12-01 20:06:58,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt +55: [2022-12-01 20:06:58,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt +55: [2022-12-01 20:06:58,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +55: [2022-12-01 20:06:58,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt +55: [2022-12-01 20:06:58,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt +55: [2022-12-01 20:06:58,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +55: [2022-12-01 20:06:58,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +55: [2022-12-01 20:06:58,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +55: [2022-12-01 20:06:58,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +55: [2022-12-01 20:06:58,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +55: [2022-12-01 20:06:58,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +55: [2022-12-01 20:06:58,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 9: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. + 9: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. + 9: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. + 7: [2022-12-01 20:06:58,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. + 9: [2022-12-01 20:06:58,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt + 9: [2022-12-01 20:06:58,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt + 9: [2022-12-01 20:06:58,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt + 9: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 9: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 9: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 7: [2022-12-01 20:06:58,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt + 7: [2022-12-01 20:06:58,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +28: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. +28: [2022-12-01 20:06:58,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt +28: [2022-12-01 20:06:58,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +28: [2022-12-01 20:06:58,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. +28: [2022-12-01 20:06:58,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt +28: [2022-12-01 20:06:58,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +58: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt. +58: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt. +58: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt. +58: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt. +58: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt. + 1: [2022-12-01 20:06:58,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +27: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. +27: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. +27: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. +27: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. +27: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. +27: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. +27: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. +27: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. +27: [2022-12-01 20:06:58,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt +27: [2022-12-01 20:06:58,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt +27: [2022-12-01 20:06:58,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt +27: [2022-12-01 20:06:58,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt +27: [2022-12-01 20:06:58,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt +27: [2022-12-01 20:06:58,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt +27: [2022-12-01 20:06:58,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt +27: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +27: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +27: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +27: [2022-12-01 20:06:58,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt +27: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +27: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +27: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +27: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +27: [2022-12-01 20:06:58,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +45: [2022-12-01 20:06:58,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt. +45: [2022-12-01 20:06:58,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt. +45: [2022-12-01 20:06:58,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt. +45: [2022-12-01 20:06:58,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt. +45: [2022-12-01 20:06:58,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt. +45: [2022-12-01 20:06:58,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt +45: [2022-12-01 20:06:58,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt +45: [2022-12-01 20:06:58,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt +45: [2022-12-01 20:06:58,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt +45: [2022-12-01 20:06:58,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +45: [2022-12-01 20:06:58,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +45: [2022-12-01 20:06:58,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +45: [2022-12-01 20:06:58,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +45: [2022-12-01 20:06:58,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt +45: [2022-12-01 20:06:58,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +45: [2022-12-01 20:06:58,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt. +45: [2022-12-01 20:06:58,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt +45: [2022-12-01 20:06:58,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +46: [2022-12-01 20:06:58,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt. +46: [2022-12-01 20:06:58,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt. +46: [2022-12-01 20:06:58,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt. +46: [2022-12-01 20:06:58,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt. +46: [2022-12-01 20:06:58,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt. +58: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt. +58: [2022-12-01 20:06:58,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt +58: [2022-12-01 20:06:58,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt +58: [2022-12-01 20:06:58,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt +58: [2022-12-01 20:06:58,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt +58: [2022-12-01 20:06:58,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt +58: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +58: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +58: [2022-12-01 20:06:58,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt +58: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +58: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +58: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +58: [2022-12-01 20:06:58,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +46: [2022-12-01 20:06:58,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt. +46: [2022-12-01 20:06:58,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt +46: [2022-12-01 20:06:58,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt +46: [2022-12-01 20:06:58,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt +46: [2022-12-01 20:06:58,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt +46: [2022-12-01 20:06:58,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt +46: [2022-12-01 20:06:58,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +46: [2022-12-01 20:06:58,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt +46: [2022-12-01 20:06:58,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +46: [2022-12-01 20:06:58,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +46: [2022-12-01 20:06:58,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +46: [2022-12-01 20:06:58,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +46: [2022-12-01 20:06:58,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 3: [2022-12-01 20:06:58,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. + 3: [2022-12-01 20:06:58,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. + 3: [2022-12-01 20:06:58,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. + 3: [2022-12-01 20:06:58,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. + 3: [2022-12-01 20:06:58,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. + 3: [2022-12-01 20:06:58,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. + 3: [2022-12-01 20:06:58,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. + 3: [2022-12-01 20:06:58,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt + 3: [2022-12-01 20:06:58,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt + 3: [2022-12-01 20:06:58,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. + 3: [2022-12-01 20:06:58,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt + 3: [2022-12-01 20:06:58,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt + 3: [2022-12-01 20:06:58,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 3: [2022-12-01 20:06:58,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt + 3: [2022-12-01 20:06:58,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 3: [2022-12-01 20:06:58,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt + 3: [2022-12-01 20:06:58,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt + 3: [2022-12-01 20:06:58,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 3: [2022-12-01 20:06:58,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt + 3: [2022-12-01 20:06:58,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 3: [2022-12-01 20:06:58,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 3: [2022-12-01 20:06:58,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 3: [2022-12-01 20:06:58,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 3: [2022-12-01 20:06:58,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +11: [2022-12-01 20:06:58,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +11: [2022-12-01 20:06:58,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. +11: [2022-12-01 20:06:58,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +11: [2022-12-01 20:06:58,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. +11: [2022-12-01 20:06:58,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt +11: [2022-12-01 20:06:58,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt +11: [2022-12-01 20:06:58,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +11: [2022-12-01 20:06:58,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +11: [2022-12-01 20:06:58,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +11: [2022-12-01 20:06:58,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +11: [2022-12-01 20:06:58,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +11: [2022-12-01 20:06:58,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +61: [2022-12-01 20:06:58,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt +61: [2022-12-01 20:06:58,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +46: [2022-12-01 20:06:58,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt. + 1: [2022-12-01 20:06:58,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt + 1: [2022-12-01 20:06:58,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +50: [2022-12-01 20:06:58,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt. +50: [2022-12-01 20:06:58,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt. +50: [2022-12-01 20:06:58,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt. +50: [2022-12-01 20:06:58,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt. +50: [2022-12-01 20:06:58,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt. +50: [2022-12-01 20:06:58,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt. +14: [2022-12-01 20:06:58,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +14: [2022-12-01 20:06:58,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. +50: [2022-12-01 20:06:58,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt. +50: [2022-12-01 20:06:58,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt +50: [2022-12-01 20:06:58,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt +50: [2022-12-01 20:06:58,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt +50: [2022-12-01 20:06:58,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt +50: [2022-12-01 20:06:58,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt +50: [2022-12-01 20:06:58,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt +50: [2022-12-01 20:06:58,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt +50: [2022-12-01 20:06:58,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +50: [2022-12-01 20:06:58,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +50: [2022-12-01 20:06:58,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +50: [2022-12-01 20:06:58,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +50: [2022-12-01 20:06:58,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +50: [2022-12-01 20:06:58,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +50: [2022-12-01 20:06:58,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +46: [2022-12-01 20:06:58,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt +46: [2022-12-01 20:06:58,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +14: [2022-12-01 20:06:58,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. +14: [2022-12-01 20:06:58,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. +14: [2022-12-01 20:06:58,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +14: [2022-12-01 20:06:58,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +14: [2022-12-01 20:06:58,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +14: [2022-12-01 20:06:58,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +14: [2022-12-01 20:06:58,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt +14: [2022-12-01 20:06:58,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt +14: [2022-12-01 20:06:58,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt +14: [2022-12-01 20:06:58,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +14: [2022-12-01 20:06:58,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +14: [2022-12-01 20:06:58,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +14: [2022-12-01 20:06:58,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +14: [2022-12-01 20:06:58,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +14: [2022-12-01 20:06:58,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +14: [2022-12-01 20:06:58,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +14: [2022-12-01 20:06:58,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +14: [2022-12-01 20:06:58,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +14: [2022-12-01 20:06:58,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 9: [2022-12-01 20:06:58,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. + 9: [2022-12-01 20:06:58,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt + 9: [2022-12-01 20:06:58,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +53: [2022-12-01 20:06:58,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt. +10: [2022-12-01 20:06:58,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +11: [2022-12-01 20:06:58,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. +11: [2022-12-01 20:06:58,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt +11: [2022-12-01 20:06:58,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +53: [2022-12-01 20:06:58,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt. +53: [2022-12-01 20:06:58,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt +53: [2022-12-01 20:06:58,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt +53: [2022-12-01 20:06:58,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +53: [2022-12-01 20:06:58,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +53: [2022-12-01 20:06:58,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt. +53: [2022-12-01 20:06:58,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt. +53: [2022-12-01 20:06:58,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt +53: [2022-12-01 20:06:58,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt +53: [2022-12-01 20:06:58,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +53: [2022-12-01 20:06:58,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +53: [2022-12-01 20:06:58,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt. +53: [2022-12-01 20:06:58,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt. +53: [2022-12-01 20:06:58,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt +53: [2022-12-01 20:06:58,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt +53: [2022-12-01 20:06:58,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +53: [2022-12-01 20:06:58,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +10: [2022-12-01 20:06:58,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +10: [2022-12-01 20:06:58,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +51: [2022-12-01 20:06:58,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt. +51: [2022-12-01 20:06:58,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt. +51: [2022-12-01 20:06:58,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt. +51: [2022-12-01 20:06:58,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt. +51: [2022-12-01 20:06:58,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt. +51: [2022-12-01 20:06:58,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt +51: [2022-12-01 20:06:58,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt. +51: [2022-12-01 20:06:58,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt. +51: [2022-12-01 20:06:58,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt +51: [2022-12-01 20:06:58,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +51: [2022-12-01 20:06:58,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt +51: [2022-12-01 20:06:58,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt +51: [2022-12-01 20:06:58,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt +51: [2022-12-01 20:06:58,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt +51: [2022-12-01 20:06:58,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +51: [2022-12-01 20:06:58,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +51: [2022-12-01 20:06:58,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt +51: [2022-12-01 20:06:58,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +51: [2022-12-01 20:06:58,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +51: [2022-12-01 20:06:58,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +51: [2022-12-01 20:06:58,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +22: [2022-12-01 20:06:58,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. +22: [2022-12-01 20:06:58,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. +22: [2022-12-01 20:06:58,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. +22: [2022-12-01 20:06:58,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. +61: [2022-12-01 20:06:58,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt. +40: [2022-12-01 20:06:58,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt. +40: [2022-12-01 20:06:58,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt. +40: [2022-12-01 20:06:58,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt. +40: [2022-12-01 20:06:58,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt. + 8: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. + 8: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. + 8: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. + 8: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. + 8: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. +40: [2022-12-01 20:06:58,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt. +40: [2022-12-01 20:06:58,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt. +40: [2022-12-01 20:06:58,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt. +40: [2022-12-01 20:06:58,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt. +40: [2022-12-01 20:06:58,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt +40: [2022-12-01 20:06:58,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt +12: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +12: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +12: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. +12: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +12: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. +40: [2022-12-01 20:06:58,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt +40: [2022-12-01 20:06:58,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +40: [2022-12-01 20:06:58,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +40: [2022-12-01 20:06:58,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt +40: [2022-12-01 20:06:58,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt +40: [2022-12-01 20:06:58,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt +40: [2022-12-01 20:06:58,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt +40: [2022-12-01 20:06:58,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt +40: [2022-12-01 20:06:58,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +40: [2022-12-01 20:06:58,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +40: [2022-12-01 20:06:58,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +40: [2022-12-01 20:06:58,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +40: [2022-12-01 20:06:58,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +40: [2022-12-01 20:06:58,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +22: [2022-12-01 20:06:58,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. +22: [2022-12-01 20:06:58,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. +22: [2022-12-01 20:06:58,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. +22: [2022-12-01 20:06:58,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. +22: [2022-12-01 20:06:58,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt +22: [2022-12-01 20:06:58,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt +22: [2022-12-01 20:06:58,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt +22: [2022-12-01 20:06:58,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt +22: [2022-12-01 20:06:58,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt +22: [2022-12-01 20:06:58,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt +22: [2022-12-01 20:06:58,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +22: [2022-12-01 20:06:58,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt +22: [2022-12-01 20:06:58,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +22: [2022-12-01 20:06:58,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +22: [2022-12-01 20:06:58,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt +22: [2022-12-01 20:06:58,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +22: [2022-12-01 20:06:58,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +22: [2022-12-01 20:06:58,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +22: [2022-12-01 20:06:58,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +22: [2022-12-01 20:06:58,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +61: [2022-12-01 20:06:58,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt +61: [2022-12-01 20:06:58,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +61: [2022-12-01 20:06:58,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt. +61: [2022-12-01 20:06:58,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt +61: [2022-12-01 20:06:58,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +10: [2022-12-01 20:06:58,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. + 8: [2022-12-01 20:06:58,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt + 8: [2022-12-01 20:06:58,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt + 8: [2022-12-01 20:06:58,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt + 8: [2022-12-01 20:06:58,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt + 8: [2022-12-01 20:06:58,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt + 8: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 8: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 8: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 8: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 8: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +10: [2022-12-01 20:06:58,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt +10: [2022-12-01 20:06:58,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +12: [2022-12-01 20:06:58,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt +12: [2022-12-01 20:06:58,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt +12: [2022-12-01 20:06:58,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +12: [2022-12-01 20:06:58,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +12: [2022-12-01 20:06:58,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +12: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +12: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +12: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +12: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +12: [2022-12-01 20:06:58,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 9: [2022-12-01 20:06:58,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. +19: [2022-12-01 20:06:58,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. +19: [2022-12-01 20:06:58,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. +19: [2022-12-01 20:06:58,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. + 9: [2022-12-01 20:06:58,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt + 9: [2022-12-01 20:06:58,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +19: [2022-12-01 20:06:58,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. +19: [2022-12-01 20:06:58,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt +19: [2022-12-01 20:06:58,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt +19: [2022-12-01 20:06:58,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt +19: [2022-12-01 20:06:58,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt +19: [2022-12-01 20:06:58,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +19: [2022-12-01 20:06:58,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +19: [2022-12-01 20:06:58,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +19: [2022-12-01 20:06:58,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +36: [2022-12-01 20:06:58,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt. +36: [2022-12-01 20:06:58,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt. +36: [2022-12-01 20:06:58,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt. +10: [2022-12-01 20:06:58,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. +36: [2022-12-01 20:06:58,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt. +36: [2022-12-01 20:06:58,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt. +36: [2022-12-01 20:06:58,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt +36: [2022-12-01 20:06:58,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt +36: [2022-12-01 20:06:58,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt +36: [2022-12-01 20:06:58,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt +36: [2022-12-01 20:06:58,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +36: [2022-12-01 20:06:58,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +36: [2022-12-01 20:06:58,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +36: [2022-12-01 20:06:58,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +36: [2022-12-01 20:06:58,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt +36: [2022-12-01 20:06:58,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +36: [2022-12-01 20:06:58,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt. +36: [2022-12-01 20:06:58,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt +36: [2022-12-01 20:06:58,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +10: [2022-12-01 20:06:58,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt +10: [2022-12-01 20:06:58,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +28: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. +36: [2022-12-01 20:06:58,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt. +36: [2022-12-01 20:06:58,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt +36: [2022-12-01 20:06:58,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +49: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt. +49: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt. +28: [2022-12-01 20:06:58,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt +28: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +49: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt. +49: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt. +49: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt. +49: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt. +49: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt. +49: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt. +49: [2022-12-01 20:06:58,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt +49: [2022-12-01 20:06:58,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt +49: [2022-12-01 20:06:58,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt +49: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +49: [2022-12-01 20:06:58,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt +49: [2022-12-01 20:06:58,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt +49: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +49: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +49: [2022-12-01 20:06:58,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt +49: [2022-12-01 20:06:58,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt +49: [2022-12-01 20:06:58,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt +49: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +49: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +49: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +49: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +49: [2022-12-01 20:06:58,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +35: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt. +35: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt. +35: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt. +35: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt. +35: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt. +35: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt. +35: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt. +35: [2022-12-01 20:06:58,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt +35: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt. +35: [2022-12-01 20:06:58,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt +35: [2022-12-01 20:06:58,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt +35: [2022-12-01 20:06:58,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt +35: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +35: [2022-12-01 20:06:58,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt +35: [2022-12-01 20:06:58,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt +35: [2022-12-01 20:06:58,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt +35: [2022-12-01 20:06:58,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt +35: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +35: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +35: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +35: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +35: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +35: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +35: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +44: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt. +62: [2022-12-01 20:06:58,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt. +62: [2022-12-01 20:06:58,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt. +62: [2022-12-01 20:06:58,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt. +10: [2022-12-01 20:06:58,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +44: [2022-12-01 20:06:58,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt +44: [2022-12-01 20:06:58,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +10: [2022-12-01 20:06:58,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +10: [2022-12-01 20:06:58,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +62: [2022-12-01 20:06:58,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt. +62: [2022-12-01 20:06:58,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt +62: [2022-12-01 20:06:58,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt +62: [2022-12-01 20:06:58,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt +62: [2022-12-01 20:06:58,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt +62: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +62: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +62: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +62: [2022-12-01 20:06:58,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +62: [2022-12-01 20:06:58,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt. +62: [2022-12-01 20:06:58,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt +62: [2022-12-01 20:06:58,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +59: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt. +59: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt. +59: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt. +59: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt. +59: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt. +59: [2022-12-01 20:06:58,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt +59: [2022-12-01 20:06:58,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt +59: [2022-12-01 20:06:58,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt +59: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt. +59: [2022-12-01 20:06:58,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt +59: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +59: [2022-12-01 20:06:58,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt +59: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt. +59: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +59: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +59: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +59: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +59: [2022-12-01 20:06:58,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt +59: [2022-12-01 20:06:58,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt +59: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +59: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +20: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. +20: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. +20: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. +20: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. +39: [2022-12-01 20:06:58,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt. +39: [2022-12-01 20:06:58,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt. +39: [2022-12-01 20:06:58,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt. +39: [2022-12-01 20:06:58,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt. +39: [2022-12-01 20:06:58,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt. +39: [2022-12-01 20:06:58,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt. +20: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. +20: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. +20: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. +20: [2022-12-01 20:06:58,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt +20: [2022-12-01 20:06:58,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt +20: [2022-12-01 20:06:58,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt +20: [2022-12-01 20:06:58,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt +20: [2022-12-01 20:06:58,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt +20: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +20: [2022-12-01 20:06:58,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt +20: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +20: [2022-12-01 20:06:58,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt +20: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +20: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +20: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +29: [2022-12-01 20:06:58,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. +29: [2022-12-01 20:06:58,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. +29: [2022-12-01 20:06:58,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. +29: [2022-12-01 20:06:58,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. +29: [2022-12-01 20:06:58,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. +20: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +20: [2022-12-01 20:06:58,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +29: [2022-12-01 20:06:58,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. +29: [2022-12-01 20:06:58,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. +29: [2022-12-01 20:06:58,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt +29: [2022-12-01 20:06:58,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. +39: [2022-12-01 20:06:58,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt. +39: [2022-12-01 20:06:58,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt. +29: [2022-12-01 20:06:58,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt +29: [2022-12-01 20:06:58,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt +29: [2022-12-01 20:06:58,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt +29: [2022-12-01 20:06:58,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt +39: [2022-12-01 20:06:58,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt +39: [2022-12-01 20:06:58,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt +29: [2022-12-01 20:06:58,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt +29: [2022-12-01 20:06:58,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +39: [2022-12-01 20:06:58,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt +29: [2022-12-01 20:06:58,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt +39: [2022-12-01 20:06:58,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt +29: [2022-12-01 20:06:58,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +29: [2022-12-01 20:06:58,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt +29: [2022-12-01 20:06:58,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +29: [2022-12-01 20:06:58,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +29: [2022-12-01 20:06:58,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +39: [2022-12-01 20:06:58,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt +39: [2022-12-01 20:06:58,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt +29: [2022-12-01 20:06:58,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +29: [2022-12-01 20:06:58,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +39: [2022-12-01 20:06:58,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +39: [2022-12-01 20:06:58,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +29: [2022-12-01 20:06:58,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +39: [2022-12-01 20:06:58,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt +39: [2022-12-01 20:06:58,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt +39: [2022-12-01 20:06:58,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +39: [2022-12-01 20:06:58,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +39: [2022-12-01 20:06:58,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +39: [2022-12-01 20:06:58,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +39: [2022-12-01 20:06:58,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +39: [2022-12-01 20:06:58,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +51: [2022-12-01 20:06:58,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt. +36: [2022-12-01 20:06:58,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt. +36: [2022-12-01 20:06:58,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt +36: [2022-12-01 20:06:58,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +51: [2022-12-01 20:06:58,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt +51: [2022-12-01 20:06:58,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +15: [2022-12-01 20:06:58,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +15: [2022-12-01 20:06:58,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. +15: [2022-12-01 20:06:58,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +15: [2022-12-01 20:06:58,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. +15: [2022-12-01 20:06:58,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +15: [2022-12-01 20:06:58,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. +52: [2022-12-01 20:06:58,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt. +52: [2022-12-01 20:06:58,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt +52: [2022-12-01 20:06:58,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +15: [2022-12-01 20:06:58,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. +15: [2022-12-01 20:06:58,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +15: [2022-12-01 20:06:58,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt +15: [2022-12-01 20:06:58,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt +15: [2022-12-01 20:06:58,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt +15: [2022-12-01 20:06:58,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +15: [2022-12-01 20:06:58,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +15: [2022-12-01 20:06:58,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt +15: [2022-12-01 20:06:58,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +15: [2022-12-01 20:06:58,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +15: [2022-12-01 20:06:58,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +15: [2022-12-01 20:06:58,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +15: [2022-12-01 20:06:58,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +15: [2022-12-01 20:06:58,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +15: [2022-12-01 20:06:58,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +15: [2022-12-01 20:06:58,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +15: [2022-12-01 20:06:58,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +15: [2022-12-01 20:06:58,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +42: [2022-12-01 20:06:58,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt. +42: [2022-12-01 20:06:58,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt. +42: [2022-12-01 20:06:58,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt. +42: [2022-12-01 20:06:58,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt. +42: [2022-12-01 20:06:58,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt. +42: [2022-12-01 20:06:58,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt. +42: [2022-12-01 20:06:58,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt +42: [2022-12-01 20:06:58,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt +42: [2022-12-01 20:06:58,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +42: [2022-12-01 20:06:58,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt +42: [2022-12-01 20:06:58,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt +42: [2022-12-01 20:06:58,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt +42: [2022-12-01 20:06:58,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt +42: [2022-12-01 20:06:58,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +42: [2022-12-01 20:06:58,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +42: [2022-12-01 20:06:58,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +42: [2022-12-01 20:06:58,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +42: [2022-12-01 20:06:58,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +42: [2022-12-01 20:06:58,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt. +42: [2022-12-01 20:06:58,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt. +42: [2022-12-01 20:06:58,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt +42: [2022-12-01 20:06:58,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt +42: [2022-12-01 20:06:58,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +42: [2022-12-01 20:06:58,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +13: [2022-12-01 20:06:58,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +13: [2022-12-01 20:06:58,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +13: [2022-12-01 20:06:58,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. +13: [2022-12-01 20:06:58,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. +13: [2022-12-01 20:06:58,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +13: [2022-12-01 20:06:58,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +13: [2022-12-01 20:06:58,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. +13: [2022-12-01 20:06:58,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +13: [2022-12-01 20:06:58,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +13: [2022-12-01 20:06:58,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. +13: [2022-12-01 20:06:58,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt +13: [2022-12-01 20:06:58,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt +13: [2022-12-01 20:06:58,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +13: [2022-12-01 20:06:58,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +13: [2022-12-01 20:06:58,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +13: [2022-12-01 20:06:58,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +13: [2022-12-01 20:06:58,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt +13: [2022-12-01 20:06:58,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt +13: [2022-12-01 20:06:58,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +13: [2022-12-01 20:06:58,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +13: [2022-12-01 20:06:58,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +13: [2022-12-01 20:06:58,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +13: [2022-12-01 20:06:58,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +13: [2022-12-01 20:06:58,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +19: [2022-12-01 20:06:58,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. +19: [2022-12-01 20:06:58,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt +41: [2022-12-01 20:06:58,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt. +41: [2022-12-01 20:06:58,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt. +41: [2022-12-01 20:06:58,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt. +41: [2022-12-01 20:06:58,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt. +19: [2022-12-01 20:06:58,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 8: [2022-12-01 20:06:58,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +41: [2022-12-01 20:06:58,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt +41: [2022-12-01 20:06:58,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt. +41: [2022-12-01 20:06:58,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt +41: [2022-12-01 20:06:58,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt +41: [2022-12-01 20:06:58,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +41: [2022-12-01 20:06:58,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt +41: [2022-12-01 20:06:58,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt +41: [2022-12-01 20:06:58,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +41: [2022-12-01 20:06:58,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +41: [2022-12-01 20:06:58,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +41: [2022-12-01 20:06:58,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +48: [2022-12-01 20:06:58,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt. +48: [2022-12-01 20:06:58,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt. +48: [2022-12-01 20:06:58,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt. +48: [2022-12-01 20:06:58,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt. +48: [2022-12-01 20:06:58,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt. +48: [2022-12-01 20:06:58,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt +48: [2022-12-01 20:06:58,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt. +48: [2022-12-01 20:06:58,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt +48: [2022-12-01 20:06:58,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt +48: [2022-12-01 20:06:58,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt +48: [2022-12-01 20:06:58,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt. +48: [2022-12-01 20:06:58,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +48: [2022-12-01 20:06:58,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt +48: [2022-12-01 20:06:58,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +48: [2022-12-01 20:06:58,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +48: [2022-12-01 20:06:58,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +48: [2022-12-01 20:06:58,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt +48: [2022-12-01 20:06:58,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt +48: [2022-12-01 20:06:58,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +48: [2022-12-01 20:06:58,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +48: [2022-12-01 20:06:58,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 8: [2022-12-01 20:06:58,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt + 8: [2022-12-01 20:06:58,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +48: [2022-12-01 20:06:58,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt. +48: [2022-12-01 20:06:58,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt +48: [2022-12-01 20:06:58,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +45: [2022-12-01 20:06:58,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt. + 8: [2022-12-01 20:06:58,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +45: [2022-12-01 20:06:58,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt +45: [2022-12-01 20:06:58,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 8: [2022-12-01 20:06:58,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt + 8: [2022-12-01 20:06:58,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +28: [2022-12-01 20:06:58,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. +28: [2022-12-01 20:06:58,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt +28: [2022-12-01 20:06:58,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +28: [2022-12-01 20:06:58,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. +10: [2022-12-01 20:06:58,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. + 1: [2022-12-01 20:06:58,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +10: [2022-12-01 20:06:58,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +10: [2022-12-01 20:06:58,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +28: [2022-12-01 20:06:58,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt +28: [2022-12-01 20:06:58,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 1: [2022-12-01 20:06:58,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt + 1: [2022-12-01 20:06:58,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +17: [2022-12-01 20:06:58,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. +17: [2022-12-01 20:06:58,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. +17: [2022-12-01 20:06:58,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. +17: [2022-12-01 20:06:58,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. +17: [2022-12-01 20:06:58,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. +17: [2022-12-01 20:06:58,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt +17: [2022-12-01 20:06:58,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt +17: [2022-12-01 20:06:58,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt +17: [2022-12-01 20:06:58,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt +17: [2022-12-01 20:06:58,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +17: [2022-12-01 20:06:58,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +17: [2022-12-01 20:06:58,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt +17: [2022-12-01 20:06:58,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. +17: [2022-12-01 20:06:58,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +17: [2022-12-01 20:06:58,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +17: [2022-12-01 20:06:58,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +17: [2022-12-01 20:06:58,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt +17: [2022-12-01 20:06:58,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 0: [2022-12-01 20:06:58,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt + 0: [2022-12-01 20:06:58,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +60: [2022-12-01 20:06:58,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt. +60: [2022-12-01 20:06:58,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt. +60: [2022-12-01 20:06:58,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt. +60: [2022-12-01 20:06:58,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt. +60: [2022-12-01 20:06:58,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt. +60: [2022-12-01 20:06:58,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt. +60: [2022-12-01 20:06:58,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt. +60: [2022-12-01 20:06:58,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt +60: [2022-12-01 20:06:58,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt +60: [2022-12-01 20:06:58,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt +60: [2022-12-01 20:06:58,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt +60: [2022-12-01 20:06:58,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt +60: [2022-12-01 20:06:58,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt +60: [2022-12-01 20:06:58,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt +60: [2022-12-01 20:06:58,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +60: [2022-12-01 20:06:58,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +60: [2022-12-01 20:06:58,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +60: [2022-12-01 20:06:58,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +60: [2022-12-01 20:06:58,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +57: [2022-12-01 20:06:58,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt. +57: [2022-12-01 20:06:58,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt. +57: [2022-12-01 20:06:58,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt. +57: [2022-12-01 20:06:58,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt. +60: [2022-12-01 20:06:58,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +60: [2022-12-01 20:06:58,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +57: [2022-12-01 20:06:58,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt. +57: [2022-12-01 20:06:58,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt. +57: [2022-12-01 20:06:58,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt. +57: [2022-12-01 20:06:58,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt +57: [2022-12-01 20:06:58,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt +57: [2022-12-01 20:06:58,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt +57: [2022-12-01 20:06:58,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt +57: [2022-12-01 20:06:58,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt +57: [2022-12-01 20:06:58,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +57: [2022-12-01 20:06:58,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt +57: [2022-12-01 20:06:58,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +57: [2022-12-01 20:06:58,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt +57: [2022-12-01 20:06:58,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +57: [2022-12-01 20:06:58,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +57: [2022-12-01 20:06:58,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +57: [2022-12-01 20:06:58,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +57: [2022-12-01 20:06:58,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +60: [2022-12-01 20:06:58,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt. +60: [2022-12-01 20:06:58,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt +60: [2022-12-01 20:06:58,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +50: [2022-12-01 20:06:58,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt. +56: [2022-12-01 20:06:58,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt. +56: [2022-12-01 20:06:58,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt. +56: [2022-12-01 20:06:58,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt. +56: [2022-12-01 20:06:58,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt. +56: [2022-12-01 20:06:58,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt. +56: [2022-12-01 20:06:58,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt. +56: [2022-12-01 20:06:58,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt. +56: [2022-12-01 20:06:58,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt +56: [2022-12-01 20:06:58,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt +56: [2022-12-01 20:06:58,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt +56: [2022-12-01 20:06:58,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt +56: [2022-12-01 20:06:58,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt +56: [2022-12-01 20:06:58,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt +56: [2022-12-01 20:06:58,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +56: [2022-12-01 20:06:58,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt +56: [2022-12-01 20:06:58,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +56: [2022-12-01 20:06:58,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +56: [2022-12-01 20:06:58,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +56: [2022-12-01 20:06:58,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +56: [2022-12-01 20:06:58,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +56: [2022-12-01 20:06:58,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +24: [2022-12-01 20:06:58,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. +24: [2022-12-01 20:06:58,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. +24: [2022-12-01 20:06:58,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. +24: [2022-12-01 20:06:58,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. +24: [2022-12-01 20:06:58,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. +24: [2022-12-01 20:06:58,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. +24: [2022-12-01 20:06:58,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt +24: [2022-12-01 20:06:58,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt +24: [2022-12-01 20:06:58,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt +24: [2022-12-01 20:06:58,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt +24: [2022-12-01 20:06:58,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt +24: [2022-12-01 20:06:58,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt +24: [2022-12-01 20:06:58,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +24: [2022-12-01 20:06:58,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +24: [2022-12-01 20:06:58,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +24: [2022-12-01 20:06:58,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +24: [2022-12-01 20:06:58,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +24: [2022-12-01 20:06:58,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +50: [2022-12-01 20:06:58,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt +50: [2022-12-01 20:06:58,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +53: [2022-12-01 20:06:58,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt. +53: [2022-12-01 20:06:58,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt +53: [2022-12-01 20:06:58,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +53: [2022-12-01 20:06:58,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt. +53: [2022-12-01 20:06:58,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt +53: [2022-12-01 20:06:58,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +24: [2022-12-01 20:06:58,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. +24: [2022-12-01 20:06:58,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt +24: [2022-12-01 20:06:58,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +43: [2022-12-01 20:06:58,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt. +43: [2022-12-01 20:06:58,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt. +43: [2022-12-01 20:06:58,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt. +43: [2022-12-01 20:06:58,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt. +12: [2022-12-01 20:06:58,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. +12: [2022-12-01 20:06:58,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt +12: [2022-12-01 20:06:58,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +43: [2022-12-01 20:06:58,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt. +43: [2022-12-01 20:06:58,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt +43: [2022-12-01 20:06:58,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt +43: [2022-12-01 20:06:58,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt +43: [2022-12-01 20:06:58,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt +43: [2022-12-01 20:06:58,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +43: [2022-12-01 20:06:58,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt +43: [2022-12-01 20:06:58,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +43: [2022-12-01 20:06:58,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +43: [2022-12-01 20:06:58,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +43: [2022-12-01 20:06:58,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +43: [2022-12-01 20:06:58,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt. +43: [2022-12-01 20:06:58,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt. +43: [2022-12-01 20:06:58,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt +43: [2022-12-01 20:06:58,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt +43: [2022-12-01 20:06:58,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +43: [2022-12-01 20:06:58,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +17: [2022-12-01 20:06:58,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. +17: [2022-12-01 20:06:58,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt +17: [2022-12-01 20:06:58,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +17: [2022-12-01 20:06:58,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. +17: [2022-12-01 20:06:58,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt +17: [2022-12-01 20:06:58,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 8: [2022-12-01 20:06:58,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. + 8: [2022-12-01 20:06:58,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt + 8: [2022-12-01 20:06:58,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 1: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. + 1: [2022-12-01 20:06:58,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt + 1: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +26: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. +26: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. +26: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. +26: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. +26: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. +26: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. +26: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. +26: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. +26: [2022-12-01 20:06:58,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt +26: [2022-12-01 20:06:58,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt +26: [2022-12-01 20:06:58,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt +26: [2022-12-01 20:06:58,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt +41: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt. +26: [2022-12-01 20:06:58,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt +26: [2022-12-01 20:06:58,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt +26: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +26: [2022-12-01 20:06:58,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt +26: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +26: [2022-12-01 20:06:58,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt +26: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +26: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +41: [2022-12-01 20:06:58,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt +26: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +26: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +41: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +26: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +26: [2022-12-01 20:06:58,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 2: [2022-12-01 20:06:58,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. + 2: [2022-12-01 20:06:58,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. + 2: [2022-12-01 20:06:58,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. + 2: [2022-12-01 20:06:58,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. + 2: [2022-12-01 20:06:58,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. + 4: [2022-12-01 20:06:58,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. + 4: [2022-12-01 20:06:58,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. + 4: [2022-12-01 20:06:58,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. + 4: [2022-12-01 20:06:58,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. + 4: [2022-12-01 20:06:58,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. + 4: [2022-12-01 20:06:58,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. + 4: [2022-12-01 20:06:58,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. + 4: [2022-12-01 20:06:58,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt + 4: [2022-12-01 20:06:58,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt + 4: [2022-12-01 20:06:58,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt + 4: [2022-12-01 20:06:58,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt + 4: [2022-12-01 20:06:58,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt + 4: [2022-12-01 20:06:58,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. + 4: [2022-12-01 20:06:58,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt + 4: [2022-12-01 20:06:58,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 4: [2022-12-01 20:06:58,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 4: [2022-12-01 20:06:58,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 4: [2022-12-01 20:06:58,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 4: [2022-12-01 20:06:58,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt + 4: [2022-12-01 20:06:58,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 4: [2022-12-01 20:06:58,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 4: [2022-12-01 20:06:58,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt + 4: [2022-12-01 20:06:58,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 4: [2022-12-01 20:06:58,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +41: [2022-12-01 20:06:58,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt. +41: [2022-12-01 20:06:58,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt +41: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +34: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt. +34: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt. +34: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt. +34: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt. +52: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt. +52: [2022-12-01 20:06:58,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt +52: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +34: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt. +34: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt. +34: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt. +34: [2022-12-01 20:06:58,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt +34: [2022-12-01 20:06:58,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt +34: [2022-12-01 20:06:58,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt +34: [2022-12-01 20:06:58,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt +34: [2022-12-01 20:06:58,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt +34: [2022-12-01 20:06:58,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt +34: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +34: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +34: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +34: [2022-12-01 20:06:58,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt +34: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +34: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +34: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt. + 5: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. + 5: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. + 5: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. + 5: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +34: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +34: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +34: [2022-12-01 20:06:58,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt +34: [2022-12-01 20:06:58,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 5: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. + 5: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. + 5: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. + 5: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. + 5: [2022-12-01 20:06:58,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt + 5: [2022-12-01 20:06:58,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt + 5: [2022-12-01 20:06:58,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt + 5: [2022-12-01 20:06:58,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt + 5: [2022-12-01 20:06:58,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt + 5: [2022-12-01 20:06:58,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt + 5: [2022-12-01 20:06:58,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt + 5: [2022-12-01 20:06:58,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt + 5: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 5: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 5: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 5: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 5: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 5: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 5: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 5: [2022-12-01 20:06:58,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +37: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt. +37: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt. +37: [2022-12-01 20:06:58,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt +37: [2022-12-01 20:06:58,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt +37: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt. +37: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt. +37: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt. +37: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt. +37: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt. +37: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +37: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt. +37: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +37: [2022-12-01 20:06:58,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt +37: [2022-12-01 20:06:58,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt +37: [2022-12-01 20:06:58,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt +37: [2022-12-01 20:06:58,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt +37: [2022-12-01 20:06:58,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt +37: [2022-12-01 20:06:58,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt +37: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +37: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +37: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +37: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +37: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +37: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +41: [2022-12-01 20:06:58,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt. +41: [2022-12-01 20:06:58,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt +41: [2022-12-01 20:06:58,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 7: [2022-12-01 20:06:58,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. + 7: [2022-12-01 20:06:58,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt + 7: [2022-12-01 20:06:58,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +62: [2022-12-01 20:06:58,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt. +62: [2022-12-01 20:06:58,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt. +62: [2022-12-01 20:06:58,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt +62: [2022-12-01 20:06:58,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt +62: [2022-12-01 20:06:58,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +62: [2022-12-01 20:06:58,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +52: [2022-12-01 20:06:58,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt. +52: [2022-12-01 20:06:58,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt +52: [2022-12-01 20:06:58,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +59: [2022-12-01 20:06:58,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt. +59: [2022-12-01 20:06:58,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt +59: [2022-12-01 20:06:58,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 2: [2022-12-01 20:06:58,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. + 2: [2022-12-01 20:06:58,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt + 2: [2022-12-01 20:06:58,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt + 2: [2022-12-01 20:06:58,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt + 2: [2022-12-01 20:06:58,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt + 2: [2022-12-01 20:06:58,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt + 2: [2022-12-01 20:06:58,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt + 2: [2022-12-01 20:06:58,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 2: [2022-12-01 20:06:58,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 2: [2022-12-01 20:06:58,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 2: [2022-12-01 20:06:58,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 2: [2022-12-01 20:06:58,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 2: [2022-12-01 20:06:58,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 6: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. + 6: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. + 6: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. + 6: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. +21: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. +21: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. +21: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. +21: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. +21: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. +21: [2022-12-01 20:06:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt +21: [2022-12-01 20:06:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt +21: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. +21: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. +21: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. +21: [2022-12-01 20:06:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt +21: [2022-12-01 20:06:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt +21: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +21: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +21: [2022-12-01 20:06:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt +21: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +21: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +21: [2022-12-01 20:06:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt +21: [2022-12-01 20:06:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt +21: [2022-12-01 20:06:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt +21: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +21: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +21: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 6: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. + 6: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. + 6: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. + 6: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +21: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 6: [2022-12-01 20:06:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt + 6: [2022-12-01 20:06:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt + 6: [2022-12-01 20:06:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt + 6: [2022-12-01 20:06:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt + 6: [2022-12-01 20:06:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt + 6: [2022-12-01 20:06:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt + 6: [2022-12-01 20:06:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt + 6: [2022-12-01 20:06:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt + 6: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 6: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 6: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 6: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 6: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 6: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 6: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 6: [2022-12-01 20:06:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +52: [2022-12-01 20:06:58,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt. +52: [2022-12-01 20:06:58,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt +52: [2022-12-01 20:06:58,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +63: [2022-12-01 20:06:58,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt. +63: [2022-12-01 20:06:58,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt. +63: [2022-12-01 20:06:58,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt. +63: [2022-12-01 20:06:58,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt. +63: [2022-12-01 20:06:58,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt. +63: [2022-12-01 20:06:58,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt. +63: [2022-12-01 20:06:58,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt. +63: [2022-12-01 20:06:58,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt. +63: [2022-12-01 20:06:58,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt +63: [2022-12-01 20:06:58,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt +63: [2022-12-01 20:06:58,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt +63: [2022-12-01 20:06:58,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt +63: [2022-12-01 20:06:58,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt +63: [2022-12-01 20:06:58,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt +63: [2022-12-01 20:06:58,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt +63: [2022-12-01 20:06:58,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt +63: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +63: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +63: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +63: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +63: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +63: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +63: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +63: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +38: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt. +38: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt. +38: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt. +38: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt. +38: [2022-12-01 20:06:58,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt +38: [2022-12-01 20:06:58,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt +38: [2022-12-01 20:06:58,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt +38: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt. +38: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt. +38: [2022-12-01 20:06:58,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt +38: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt. +38: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt. +38: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +38: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +38: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +38: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +38: [2022-12-01 20:06:58,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt +38: [2022-12-01 20:06:58,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt +38: [2022-12-01 20:06:58,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt +38: [2022-12-01 20:06:58,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt +38: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +38: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +38: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +38: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 9: [2022-12-01 20:06:58,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. + 9: [2022-12-01 20:06:58,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt + 9: [2022-12-01 20:06:58,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +54: [2022-12-01 20:06:58,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt. +54: [2022-12-01 20:06:58,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt. +54: [2022-12-01 20:06:58,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt. +54: [2022-12-01 20:06:58,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt. +54: [2022-12-01 20:06:58,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt. +54: [2022-12-01 20:06:58,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt +54: [2022-12-01 20:06:58,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt. +54: [2022-12-01 20:06:58,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt +54: [2022-12-01 20:06:58,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt +54: [2022-12-01 20:06:58,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt +54: [2022-12-01 20:06:58,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +54: [2022-12-01 20:06:58,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +54: [2022-12-01 20:06:58,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt +54: [2022-12-01 20:06:58,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt +54: [2022-12-01 20:06:58,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +54: [2022-12-01 20:06:58,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +54: [2022-12-01 20:06:58,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +54: [2022-12-01 20:06:58,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +47: [2022-12-01 20:06:58,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt. +47: [2022-12-01 20:06:58,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt. +47: [2022-12-01 20:06:58,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt. +47: [2022-12-01 20:06:58,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt. +47: [2022-12-01 20:06:58,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt. +47: [2022-12-01 20:06:58,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt. +47: [2022-12-01 20:06:58,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt +47: [2022-12-01 20:06:58,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt +47: [2022-12-01 20:06:58,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt +47: [2022-12-01 20:06:58,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt +47: [2022-12-01 20:06:58,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt +47: [2022-12-01 20:06:58,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt +47: [2022-12-01 20:06:58,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt. +47: [2022-12-01 20:06:58,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +47: [2022-12-01 20:06:58,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +47: [2022-12-01 20:06:58,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +47: [2022-12-01 20:06:58,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +47: [2022-12-01 20:06:58,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +47: [2022-12-01 20:06:58,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +47: [2022-12-01 20:06:58,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt. +47: [2022-12-01 20:06:58,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt +47: [2022-12-01 20:06:58,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt +47: [2022-12-01 20:06:58,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +47: [2022-12-01 20:06:58,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +18: [2022-12-01 20:06:58,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. +18: [2022-12-01 20:06:58,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. +18: [2022-12-01 20:06:58,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. +18: [2022-12-01 20:06:58,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. +18: [2022-12-01 20:06:58,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. +18: [2022-12-01 20:06:58,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. +18: [2022-12-01 20:06:58,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. +18: [2022-12-01 20:06:58,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. +18: [2022-12-01 20:06:58,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt +18: [2022-12-01 20:06:58,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt +18: [2022-12-01 20:06:58,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt +18: [2022-12-01 20:06:58,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt +18: [2022-12-01 20:06:58,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt +18: [2022-12-01 20:06:58,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt +18: [2022-12-01 20:06:58,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +18: [2022-12-01 20:06:58,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt +18: [2022-12-01 20:06:58,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +18: [2022-12-01 20:06:58,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt +18: [2022-12-01 20:06:58,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +18: [2022-12-01 20:06:58,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +18: [2022-12-01 20:06:58,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +18: [2022-12-01 20:06:58,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +18: [2022-12-01 20:06:58,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +18: [2022-12-01 20:06:58,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +62: [2022-12-01 20:06:58,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt. +62: [2022-12-01 20:06:58,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt +62: [2022-12-01 20:06:58,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +10: [2022-12-01 20:06:58,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. +10: [2022-12-01 20:06:58,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt +10: [2022-12-01 20:06:58,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 9: [2022-12-01 20:06:58,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. + 9: [2022-12-01 20:06:58,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt + 9: [2022-12-01 20:06:58,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +10: [2022-12-01 20:06:58,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +10: [2022-12-01 20:06:58,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +10: [2022-12-01 20:06:58,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +25: [2022-12-01 20:06:58,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. +25: [2022-12-01 20:06:58,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. +25: [2022-12-01 20:06:58,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. +25: [2022-12-01 20:06:58,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. +25: [2022-12-01 20:06:58,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt +25: [2022-12-01 20:06:58,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. +25: [2022-12-01 20:06:58,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. +25: [2022-12-01 20:06:58,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. +25: [2022-12-01 20:06:58,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. +25: [2022-12-01 20:06:58,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt +25: [2022-12-01 20:06:58,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt +25: [2022-12-01 20:06:58,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +25: [2022-12-01 20:06:58,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt +25: [2022-12-01 20:06:58,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt +25: [2022-12-01 20:06:58,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +25: [2022-12-01 20:06:58,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt +25: [2022-12-01 20:06:58,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt +25: [2022-12-01 20:06:58,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt +25: [2022-12-01 20:06:58,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +25: [2022-12-01 20:06:58,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +25: [2022-12-01 20:06:58,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +25: [2022-12-01 20:06:58,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +25: [2022-12-01 20:06:58,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +25: [2022-12-01 20:06:58,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +44: [2022-12-01 20:06:58,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt. +44: [2022-12-01 20:06:58,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt +44: [2022-12-01 20:06:58,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 9: [2022-12-01 20:06:58,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. + 9: [2022-12-01 20:06:58,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt + 9: [2022-12-01 20:06:58,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +44: [2022-12-01 20:06:58,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt. +44: [2022-12-01 20:06:58,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt +44: [2022-12-01 20:06:58,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +11: [2022-12-01 20:06:58,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. +11: [2022-12-01 20:06:58,813] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt +11: [2022-12-01 20:06:58,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +20: [2022-12-01 20:06:58,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. +20: [2022-12-01 20:06:58,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt +20: [2022-12-01 20:06:58,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 2: [2022-12-01 20:06:58,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. + 2: [2022-12-01 20:06:58,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt + 2: [2022-12-01 20:06:58,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +58: [2022-12-01 20:06:58,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt. +58: [2022-12-01 20:06:58,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt +58: [2022-12-01 20:06:58,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +44: [2022-12-01 20:06:58,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt. +44: [2022-12-01 20:06:58,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt +44: [2022-12-01 20:06:58,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +31: [2022-12-01 20:06:58,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. +31: [2022-12-01 20:06:58,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. +31: [2022-12-01 20:06:58,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. +31: [2022-12-01 20:06:58,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. +31: [2022-12-01 20:06:58,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. +31: [2022-12-01 20:06:58,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. +31: [2022-12-01 20:06:58,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. +31: [2022-12-01 20:06:58,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt +31: [2022-12-01 20:06:58,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. +31: [2022-12-01 20:06:58,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt +31: [2022-12-01 20:06:58,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt +31: [2022-12-01 20:06:58,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt +31: [2022-12-01 20:06:58,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt +31: [2022-12-01 20:06:58,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt +31: [2022-12-01 20:06:58,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt +31: [2022-12-01 20:06:58,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +31: [2022-12-01 20:06:58,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt +31: [2022-12-01 20:06:58,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +31: [2022-12-01 20:06:58,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +31: [2022-12-01 20:06:58,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +31: [2022-12-01 20:06:58,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +31: [2022-12-01 20:06:58,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +31: [2022-12-01 20:06:58,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +31: [2022-12-01 20:06:58,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +23: [2022-12-01 20:06:58,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. +23: [2022-12-01 20:06:58,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. +23: [2022-12-01 20:06:58,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. +23: [2022-12-01 20:06:58,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. +23: [2022-12-01 20:06:58,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt +23: [2022-12-01 20:06:58,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. +23: [2022-12-01 20:06:58,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. +23: [2022-12-01 20:06:58,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. +23: [2022-12-01 20:06:58,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt +23: [2022-12-01 20:06:58,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt +23: [2022-12-01 20:06:58,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. +23: [2022-12-01 20:06:58,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt +23: [2022-12-01 20:06:58,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +23: [2022-12-01 20:06:58,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +23: [2022-12-01 20:06:58,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +23: [2022-12-01 20:06:58,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +23: [2022-12-01 20:06:58,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt +23: [2022-12-01 20:06:58,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt +23: [2022-12-01 20:06:58,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt +23: [2022-12-01 20:06:58,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt +23: [2022-12-01 20:06:58,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +23: [2022-12-01 20:06:58,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +23: [2022-12-01 20:06:58,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +23: [2022-12-01 20:06:58,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +44: [2022-12-01 20:06:58,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt. +44: [2022-12-01 20:06:58,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt +44: [2022-12-01 20:06:58,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +11: [2022-12-01 20:06:58,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +11: [2022-12-01 20:06:58,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +11: [2022-12-01 20:06:58,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 2: [2022-12-01 20:06:58,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. + 2: [2022-12-01 20:06:58,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt + 2: [2022-12-01 20:06:58,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +56: [2022-12-01 20:06:58,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt. +56: [2022-12-01 20:06:58,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt +56: [2022-12-01 20:06:58,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +19: [2022-12-01 20:06:58,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. +19: [2022-12-01 20:06:58,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt +19: [2022-12-01 20:06:58,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +16: [2022-12-01 20:06:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. +16: [2022-12-01 20:06:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. +16: [2022-12-01 20:06:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. +16: [2022-12-01 20:06:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt +16: [2022-12-01 20:06:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt +16: [2022-12-01 20:06:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. +16: [2022-12-01 20:06:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. +16: [2022-12-01 20:06:58,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +16: [2022-12-01 20:06:58,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +16: [2022-12-01 20:06:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt +16: [2022-12-01 20:06:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt +16: [2022-12-01 20:06:58,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +16: [2022-12-01 20:06:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt +16: [2022-12-01 20:06:58,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +16: [2022-12-01 20:06:58,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +16: [2022-12-01 20:06:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. +16: [2022-12-01 20:06:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. +16: [2022-12-01 20:06:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt +16: [2022-12-01 20:06:58,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +16: [2022-12-01 20:06:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt +16: [2022-12-01 20:06:58,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +16: [2022-12-01 20:06:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. +16: [2022-12-01 20:06:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt +16: [2022-12-01 20:06:58,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +43: [2022-12-01 20:06:58,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt. +43: [2022-12-01 20:06:58,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt +43: [2022-12-01 20:06:58,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +54: [2022-12-01 20:06:58,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt. +54: [2022-12-01 20:06:58,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt +54: [2022-12-01 20:06:58,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +19: [2022-12-01 20:06:58,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. +19: [2022-12-01 20:06:58,925] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt +19: [2022-12-01 20:06:58,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +19: [2022-12-01 20:06:58,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. +19: [2022-12-01 20:06:58,925] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt +19: [2022-12-01 20:06:58,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +10: [2022-12-01 20:06:58,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. +10: [2022-12-01 20:06:58,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt +10: [2022-12-01 20:06:58,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +11: [2022-12-01 20:06:58,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +11: [2022-12-01 20:06:58,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +11: [2022-12-01 20:06:58,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +28: [2022-12-01 20:06:58,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. +28: [2022-12-01 20:06:58,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt +28: [2022-12-01 20:06:58,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +54: [2022-12-01 20:06:59,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt. +54: [2022-12-01 20:06:59,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt +54: [2022-12-01 20:06:59,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +57: [2022-12-01 20:06:59,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt. +57: [2022-12-01 20:06:59,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt +57: [2022-12-01 20:06:59,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +46: [2022-12-01 20:06:59,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt. +46: [2022-12-01 20:06:59,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt +46: [2022-12-01 20:06:59,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +12: [2022-12-01 20:06:59,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +12: [2022-12-01 20:06:59,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +12: [2022-12-01 20:06:59,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +12: [2022-12-01 20:06:59,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. +12: [2022-12-01 20:06:59,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt +12: [2022-12-01 20:06:59,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +45: [2022-12-01 20:06:59,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt. +45: [2022-12-01 20:06:59,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt +45: [2022-12-01 20:06:59,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 1: [2022-12-01 20:06:59,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. + 1: [2022-12-01 20:06:59,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt + 1: [2022-12-01 20:06:59,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +14: [2022-12-01 20:06:59,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. +14: [2022-12-01 20:06:59,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt +14: [2022-12-01 20:06:59,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +58: [2022-12-01 20:06:59,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt. +58: [2022-12-01 20:06:59,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt +58: [2022-12-01 20:06:59,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +24: [2022-12-01 20:06:59,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. +24: [2022-12-01 20:06:59,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step1000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt +24: [2022-12-01 20:06:59,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! + 0: successfully saved checkpoint at iteration 1000 to checkpoints_8b7beta +63: time (ms) | save-checkpoint: 8497.55 +63: iteration 1010/ 5494 | consumed samples: 1034240 | consumed tokens: 2118123520 | elapsed time per iteration (s): 6.75 | learning rate: 1.867E-04 | global batch size: 1024 | lm loss: 2.922859E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 151.659 | TFLOPs: 33.91 | +63: iteration 1020/ 5494 | consumed samples: 1044480 | consumed tokens: 2139095040 | elapsed time per iteration (s): 5.54 | learning rate: 1.864E-04 | global batch size: 1024 | lm loss: 2.922768E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.820 | TFLOPs: 41.32 | +63: iteration 1030/ 5494 | consumed samples: 1054720 | consumed tokens: 2160066560 | elapsed time per iteration (s): 5.50 | learning rate: 1.861E-04 | global batch size: 1024 | lm loss: 2.892291E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.291 | TFLOPs: 41.65 | +63: iteration 1040/ 5494 | consumed samples: 1064960 | consumed tokens: 2181038080 | elapsed time per iteration (s): 5.53 | learning rate: 1.858E-04 | global batch size: 1024 | lm loss: 2.892736E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.063 | TFLOPs: 41.37 | +63: iteration 1050/ 5494 | consumed samples: 1075200 | consumed tokens: 2202009600 | elapsed time per iteration (s): 5.69 | learning rate: 1.855E-04 | global batch size: 1024 | lm loss: 2.880710E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.043 | TFLOPs: 40.25 | +63: iteration 1060/ 5494 | consumed samples: 1085440 | consumed tokens: 2222981120 | elapsed time per iteration (s): 5.92 | learning rate: 1.853E-04 | global batch size: 1024 | lm loss: 2.902648E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.904 | TFLOPs: 38.66 | +63: iteration 1070/ 5494 | consumed samples: 1095680 | consumed tokens: 2243952640 | elapsed time per iteration (s): 5.38 | learning rate: 1.850E-04 | global batch size: 1024 | lm loss: 2.862010E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.260 | TFLOPs: 42.54 | +63: iteration 1080/ 5494 | consumed samples: 1105920 | consumed tokens: 2264924160 | elapsed time per iteration (s): 5.92 | learning rate: 1.847E-04 | global batch size: 1024 | lm loss: 2.858587E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.947 | TFLOPs: 38.67 | +63: iteration 1090/ 5494 | consumed samples: 1116160 | consumed tokens: 2285895680 | elapsed time per iteration (s): 5.50 | learning rate: 1.844E-04 | global batch size: 1024 | lm loss: 2.848578E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.106 | TFLOPs: 41.61 | +63: iteration 1100/ 5494 | consumed samples: 1126400 | consumed tokens: 2306867200 | elapsed time per iteration (s): 5.95 | learning rate: 1.841E-04 | global batch size: 1024 | lm loss: 2.868756E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.242 | TFLOPs: 38.51 | +63: iteration 1110/ 5494 | consumed samples: 1136640 | consumed tokens: 2327838720 | elapsed time per iteration (s): 5.78 | learning rate: 1.838E-04 | global batch size: 1024 | lm loss: 2.836075E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.041 | TFLOPs: 39.58 | +63: iteration 1120/ 5494 | consumed samples: 1146880 | consumed tokens: 2348810240 | elapsed time per iteration (s): 5.52 | learning rate: 1.835E-04 | global batch size: 1024 | lm loss: 2.849834E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.354 | TFLOPs: 41.44 | +63: iteration 1130/ 5494 | consumed samples: 1157120 | consumed tokens: 2369781760 | elapsed time per iteration (s): 5.69 | learning rate: 1.832E-04 | global batch size: 1024 | lm loss: 2.809186E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.936 | TFLOPs: 40.23 | +63: iteration 1140/ 5494 | consumed samples: 1167360 | consumed tokens: 2390753280 | elapsed time per iteration (s): 5.79 | learning rate: 1.829E-04 | global batch size: 1024 | lm loss: 2.812944E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.966 | TFLOPs: 39.56 | +63: iteration 1150/ 5494 | consumed samples: 1177600 | consumed tokens: 2411724800 | elapsed time per iteration (s): 6.15 | learning rate: 1.826E-04 | global batch size: 1024 | lm loss: 2.805520E+00 | grad norm: 0.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 166.410 | TFLOPs: 37.20 | +63: iteration 1160/ 5494 | consumed samples: 1187840 | consumed tokens: 2432696320 | elapsed time per iteration (s): 5.54 | learning rate: 1.823E-04 | global batch size: 1024 | lm loss: 2.849624E+00 | grad norm: 0.588 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.810 | TFLOPs: 41.32 | +63: iteration 1170/ 5494 | consumed samples: 1198080 | consumed tokens: 2453667840 | elapsed time per iteration (s): 6.07 | learning rate: 1.820E-04 | global batch size: 1024 | lm loss: 2.789980E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.837 | TFLOPs: 37.75 | +63: iteration 1180/ 5494 | consumed samples: 1208320 | consumed tokens: 2474639360 | elapsed time per iteration (s): 5.63 | learning rate: 1.817E-04 | global batch size: 1024 | lm loss: 2.778788E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.892 | TFLOPs: 40.67 | +63: iteration 1190/ 5494 | consumed samples: 1218560 | consumed tokens: 2495610880 | elapsed time per iteration (s): 5.66 | learning rate: 1.813E-04 | global batch size: 1024 | lm loss: 2.755759E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.767 | TFLOPs: 40.41 | +63: iteration 1200/ 5494 | consumed samples: 1228800 | consumed tokens: 2516582400 | elapsed time per iteration (s): 5.91 | learning rate: 1.810E-04 | global batch size: 1024 | lm loss: 2.779517E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.307 | TFLOPs: 38.75 | +63: iteration 1210/ 5494 | consumed samples: 1239040 | consumed tokens: 2537553920 | elapsed time per iteration (s): 5.65 | learning rate: 1.807E-04 | global batch size: 1024 | lm loss: 2.751804E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.254 | TFLOPs: 40.52 | +63: iteration 1220/ 5494 | consumed samples: 1249280 | consumed tokens: 2558525440 | elapsed time per iteration (s): 5.40 | learning rate: 1.804E-04 | global batch size: 1024 | lm loss: 2.768176E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.464 | TFLOPs: 42.36 | +63: iteration 1230/ 5494 | consumed samples: 1259520 | consumed tokens: 2579496960 | elapsed time per iteration (s): 6.04 | learning rate: 1.801E-04 | global batch size: 1024 | lm loss: 2.758223E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 169.490 | TFLOPs: 37.89 | +63: iteration 1240/ 5494 | consumed samples: 1269760 | consumed tokens: 2600468480 | elapsed time per iteration (s): 5.89 | learning rate: 1.797E-04 | global batch size: 1024 | lm loss: 2.754824E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.927 | TFLOPs: 38.88 | +63: iteration 1250/ 5494 | consumed samples: 1280000 | consumed tokens: 2621440000 | elapsed time per iteration (s): 5.52 | learning rate: 1.794E-04 | global batch size: 1024 | lm loss: 2.725366E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.455 | TFLOPs: 41.46 | +63: iteration 1260/ 5494 | consumed samples: 1290240 | consumed tokens: 2642411520 | elapsed time per iteration (s): 5.67 | learning rate: 1.791E-04 | global batch size: 1024 | lm loss: 2.706145E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.502 | TFLOPs: 40.35 | +63: iteration 1270/ 5494 | consumed samples: 1300480 | consumed tokens: 2663383040 | elapsed time per iteration (s): 5.64 | learning rate: 1.787E-04 | global batch size: 1024 | lm loss: 2.703922E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.635 | TFLOPs: 40.61 | +63: iteration 1280/ 5494 | consumed samples: 1310720 | consumed tokens: 2684354560 | elapsed time per iteration (s): 5.52 | learning rate: 1.784E-04 | global batch size: 1024 | lm loss: 2.723801E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.456 | TFLOPs: 41.46 | +63: iteration 1290/ 5494 | consumed samples: 1320960 | consumed tokens: 2705326080 | elapsed time per iteration (s): 5.80 | learning rate: 1.781E-04 | global batch size: 1024 | lm loss: 2.735729E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.640 | TFLOPs: 39.49 | +63: iteration 1300/ 5494 | consumed samples: 1331200 | consumed tokens: 2726297600 | elapsed time per iteration (s): 6.15 | learning rate: 1.777E-04 | global batch size: 1024 | lm loss: 2.738804E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 166.626 | TFLOPs: 37.25 | +63: iteration 1310/ 5494 | consumed samples: 1341440 | consumed tokens: 2747269120 | elapsed time per iteration (s): 5.61 | learning rate: 1.774E-04 | global batch size: 1024 | lm loss: 2.697970E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.509 | TFLOPs: 40.80 | +63: iteration 1320/ 5494 | consumed samples: 1351680 | consumed tokens: 2768240640 | elapsed time per iteration (s): 5.83 | learning rate: 1.770E-04 | global batch size: 1024 | lm loss: 2.685928E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.626 | TFLOPs: 39.26 | +63: iteration 1330/ 5494 | consumed samples: 1361920 | consumed tokens: 2789212160 | elapsed time per iteration (s): 6.13 | learning rate: 1.767E-04 | global batch size: 1024 | lm loss: 2.676560E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 166.932 | TFLOPs: 37.32 | +63: iteration 1340/ 5494 | consumed samples: 1372160 | consumed tokens: 2810183680 | elapsed time per iteration (s): 5.70 | learning rate: 1.763E-04 | global batch size: 1024 | lm loss: 2.682208E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.658 | TFLOPs: 40.17 | +63: iteration 1350/ 5494 | consumed samples: 1382400 | consumed tokens: 2831155200 | elapsed time per iteration (s): 5.71 | learning rate: 1.760E-04 | global batch size: 1024 | lm loss: 2.665909E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.367 | TFLOPs: 40.10 | +63: iteration 1360/ 5494 | consumed samples: 1392640 | consumed tokens: 2852126720 | elapsed time per iteration (s): 5.63 | learning rate: 1.756E-04 | global batch size: 1024 | lm loss: 2.674405E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.903 | TFLOPs: 40.67 | +63: iteration 1370/ 5494 | consumed samples: 1402880 | consumed tokens: 2873098240 | elapsed time per iteration (s): 5.89 | learning rate: 1.753E-04 | global batch size: 1024 | lm loss: 2.678618E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.784 | TFLOPs: 38.85 | +63: iteration 1380/ 5494 | consumed samples: 1413120 | consumed tokens: 2894069760 | elapsed time per iteration (s): 6.17 | learning rate: 1.749E-04 | global batch size: 1024 | lm loss: 2.670544E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 165.945 | TFLOPs: 37.10 | +63: iteration 1390/ 5494 | consumed samples: 1423360 | consumed tokens: 2915041280 | elapsed time per iteration (s): 5.78 | learning rate: 1.745E-04 | global batch size: 1024 | lm loss: 2.668024E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.087 | TFLOPs: 39.59 | +63: iteration 1400/ 5494 | consumed samples: 1433600 | consumed tokens: 2936012800 | elapsed time per iteration (s): 5.80 | learning rate: 1.742E-04 | global batch size: 1024 | lm loss: 2.656924E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.590 | TFLOPs: 39.48 | +63: iteration 1410/ 5494 | consumed samples: 1443840 | consumed tokens: 2956984320 | elapsed time per iteration (s): 5.68 | learning rate: 1.738E-04 | global batch size: 1024 | lm loss: 2.653225E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.187 | TFLOPs: 40.28 | +63: iteration 1420/ 5494 | consumed samples: 1454080 | consumed tokens: 2977955840 | elapsed time per iteration (s): 5.40 | learning rate: 1.734E-04 | global batch size: 1024 | lm loss: 2.654748E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.470 | TFLOPs: 42.36 | +63: iteration 1430/ 5494 | consumed samples: 1464320 | consumed tokens: 2998927360 | elapsed time per iteration (s): 5.85 | learning rate: 1.731E-04 | global batch size: 1024 | lm loss: 2.649160E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.894 | TFLOPs: 39.10 | +63: iteration 1440/ 5494 | consumed samples: 1474560 | consumed tokens: 3019898880 | elapsed time per iteration (s): 5.55 | learning rate: 1.727E-04 | global batch size: 1024 | lm loss: 2.625255E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.361 | TFLOPs: 41.22 | +63: iteration 1450/ 5494 | consumed samples: 1484800 | consumed tokens: 3040870400 | elapsed time per iteration (s): 5.40 | learning rate: 1.723E-04 | global batch size: 1024 | lm loss: 2.628920E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.571 | TFLOPs: 42.38 | +63: iteration 1460/ 5494 | consumed samples: 1495040 | consumed tokens: 3061841920 | elapsed time per iteration (s): 5.78 | learning rate: 1.720E-04 | global batch size: 1024 | lm loss: 2.625284E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.166 | TFLOPs: 39.61 | +63: iteration 1470/ 5494 | consumed samples: 1505280 | consumed tokens: 3082813440 | elapsed time per iteration (s): 5.93 | learning rate: 1.716E-04 | global batch size: 1024 | lm loss: 2.638582E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.536 | TFLOPs: 38.57 | +63: iteration 1480/ 5494 | consumed samples: 1515520 | consumed tokens: 3103784960 | elapsed time per iteration (s): 5.68 | learning rate: 1.712E-04 | global batch size: 1024 | lm loss: 2.606926E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.337 | TFLOPs: 40.32 | +63: iteration 1490/ 5494 | consumed samples: 1525760 | consumed tokens: 3124756480 | elapsed time per iteration (s): 5.68 | learning rate: 1.708E-04 | global batch size: 1024 | lm loss: 2.612521E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.215 | TFLOPs: 40.29 | +63: iteration 1500/ 5494 | consumed samples: 1536000 | consumed tokens: 3145728000 | elapsed time per iteration (s): 5.52 | learning rate: 1.704E-04 | global batch size: 1024 | lm loss: 2.626688E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.433 | TFLOPs: 41.46 | +63: iteration 1510/ 5494 | consumed samples: 1546240 | consumed tokens: 3166699520 | elapsed time per iteration (s): 5.82 | learning rate: 1.700E-04 | global batch size: 1024 | lm loss: 2.615223E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.919 | TFLOPs: 39.33 | +63: iteration 1520/ 5494 | consumed samples: 1556480 | consumed tokens: 3187671040 | elapsed time per iteration (s): 5.93 | learning rate: 1.697E-04 | global batch size: 1024 | lm loss: 2.600809E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.661 | TFLOPs: 38.60 | +63: iteration 1530/ 5494 | consumed samples: 1566720 | consumed tokens: 3208642560 | elapsed time per iteration (s): 5.66 | learning rate: 1.693E-04 | global batch size: 1024 | lm loss: 2.583979E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.066 | TFLOPs: 40.48 | +63: iteration 1540/ 5494 | consumed samples: 1576960 | consumed tokens: 3229614080 | elapsed time per iteration (s): 5.53 | learning rate: 1.689E-04 | global batch size: 1024 | lm loss: 2.587565E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.338 | TFLOPs: 41.44 | +63: iteration 1550/ 5494 | consumed samples: 1587200 | consumed tokens: 3250585600 | elapsed time per iteration (s): 6.07 | learning rate: 1.685E-04 | global batch size: 1024 | lm loss: 2.584255E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.808 | TFLOPs: 37.74 | +63: iteration 1560/ 5494 | consumed samples: 1597440 | consumed tokens: 3271557120 | elapsed time per iteration (s): 5.64 | learning rate: 1.681E-04 | global batch size: 1024 | lm loss: 2.607948E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.452 | TFLOPs: 40.57 | +63: iteration 1570/ 5494 | consumed samples: 1607680 | consumed tokens: 3292528640 | elapsed time per iteration (s): 5.68 | learning rate: 1.677E-04 | global batch size: 1024 | lm loss: 2.576628E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.177 | TFLOPs: 40.28 | +63: iteration 1580/ 5494 | consumed samples: 1617920 | consumed tokens: 3313500160 | elapsed time per iteration (s): 5.52 | learning rate: 1.673E-04 | global batch size: 1024 | lm loss: 2.563298E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.631 | TFLOPs: 41.50 | +63: iteration 1590/ 5494 | consumed samples: 1628160 | consumed tokens: 3334471680 | elapsed time per iteration (s): 5.40 | learning rate: 1.669E-04 | global batch size: 1024 | lm loss: 2.569667E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.756 | TFLOPs: 42.42 | +63: iteration 1600/ 5494 | consumed samples: 1638400 | consumed tokens: 3355443200 | elapsed time per iteration (s): 5.81 | learning rate: 1.665E-04 | global batch size: 1024 | lm loss: 2.563815E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.318 | TFLOPs: 39.42 | +63: iteration 1610/ 5494 | consumed samples: 1648640 | consumed tokens: 3376414720 | elapsed time per iteration (s): 5.76 | learning rate: 1.661E-04 | global batch size: 1024 | lm loss: 2.560911E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.654 | TFLOPs: 39.72 | +63: iteration 1620/ 5494 | consumed samples: 1658880 | consumed tokens: 3397386240 | elapsed time per iteration (s): 5.37 | learning rate: 1.657E-04 | global batch size: 1024 | lm loss: 2.545437E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.608 | TFLOPs: 42.61 | +63: iteration 1630/ 5494 | consumed samples: 1669120 | consumed tokens: 3418357760 | elapsed time per iteration (s): 5.75 | learning rate: 1.653E-04 | global batch size: 1024 | lm loss: 2.548339E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.018 | TFLOPs: 39.80 | +63: iteration 1640/ 5494 | consumed samples: 1679360 | consumed tokens: 3439329280 | elapsed time per iteration (s): 6.16 | learning rate: 1.648E-04 | global batch size: 1024 | lm loss: 2.546495E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 166.213 | TFLOPs: 37.16 | +63: iteration 1650/ 5494 | consumed samples: 1689600 | consumed tokens: 3460300800 | elapsed time per iteration (s): 5.66 | learning rate: 1.644E-04 | global batch size: 1024 | lm loss: 2.550083E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.766 | TFLOPs: 40.41 | +63: iteration 1660/ 5494 | consumed samples: 1699840 | consumed tokens: 3481272320 | elapsed time per iteration (s): 5.62 | learning rate: 1.640E-04 | global batch size: 1024 | lm loss: 2.534601E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.149 | TFLOPs: 40.72 | +63: iteration 1670/ 5494 | consumed samples: 1710080 | consumed tokens: 3502243840 | elapsed time per iteration (s): 5.51 | learning rate: 1.636E-04 | global batch size: 1024 | lm loss: 2.540636E+00 | grad norm: 1.659 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.842 | TFLOPs: 41.55 | +63: iteration 1680/ 5494 | consumed samples: 1720320 | consumed tokens: 3523215360 | elapsed time per iteration (s): 5.65 | learning rate: 1.632E-04 | global batch size: 1024 | lm loss: 2.585595E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.134 | TFLOPs: 40.50 | +63: iteration 1690/ 5494 | consumed samples: 1730560 | consumed tokens: 3544186880 | elapsed time per iteration (s): 5.69 | learning rate: 1.628E-04 | global batch size: 1024 | lm loss: 2.551530E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.929 | TFLOPs: 40.23 | +63: iteration 1700/ 5494 | consumed samples: 1740800 | consumed tokens: 3565158400 | elapsed time per iteration (s): 5.63 | learning rate: 1.623E-04 | global batch size: 1024 | lm loss: 2.525348E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.756 | TFLOPs: 40.63 | +63: iteration 1710/ 5494 | consumed samples: 1751040 | consumed tokens: 3586129920 | elapsed time per iteration (s): 5.78 | learning rate: 1.619E-04 | global batch size: 1024 | lm loss: 2.521863E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.197 | TFLOPs: 39.62 | +63: iteration 1720/ 5494 | consumed samples: 1761280 | consumed tokens: 3607101440 | elapsed time per iteration (s): 5.82 | learning rate: 1.615E-04 | global batch size: 1024 | lm loss: 2.525274E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.961 | TFLOPs: 39.34 | +63: iteration 1730/ 5494 | consumed samples: 1771520 | consumed tokens: 3628072960 | elapsed time per iteration (s): 5.90 | learning rate: 1.611E-04 | global batch size: 1024 | lm loss: 2.493471E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.642 | TFLOPs: 38.82 | +63: iteration 1740/ 5494 | consumed samples: 1781760 | consumed tokens: 3649044480 | elapsed time per iteration (s): 5.52 | learning rate: 1.606E-04 | global batch size: 1024 | lm loss: 2.518942E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.599 | TFLOPs: 41.49 | +63: iteration 1750/ 5494 | consumed samples: 1792000 | consumed tokens: 3670016000 | elapsed time per iteration (s): 5.67 | learning rate: 1.602E-04 | global batch size: 1024 | lm loss: 2.512356E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.528 | TFLOPs: 40.36 | +63: iteration 1760/ 5494 | consumed samples: 1802240 | consumed tokens: 3690987520 | elapsed time per iteration (s): 5.86 | learning rate: 1.598E-04 | global batch size: 1024 | lm loss: 2.522733E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.840 | TFLOPs: 39.09 | +63: iteration 1770/ 5494 | consumed samples: 1812480 | consumed tokens: 3711959040 | elapsed time per iteration (s): 6.14 | learning rate: 1.593E-04 | global batch size: 1024 | lm loss: 2.511637E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 166.669 | TFLOPs: 37.26 | +63: iteration 1780/ 5494 | consumed samples: 1822720 | consumed tokens: 3732930560 | elapsed time per iteration (s): 5.78 | learning rate: 1.589E-04 | global batch size: 1024 | lm loss: 2.506890E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.079 | TFLOPs: 39.59 | +63: iteration 1790/ 5494 | consumed samples: 1832960 | consumed tokens: 3753902080 | elapsed time per iteration (s): 5.50 | learning rate: 1.585E-04 | global batch size: 1024 | lm loss: 2.536132E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.087 | TFLOPs: 41.60 | +63: iteration 1800/ 5494 | consumed samples: 1843200 | consumed tokens: 3774873600 | elapsed time per iteration (s): 5.75 | learning rate: 1.580E-04 | global batch size: 1024 | lm loss: 2.510224E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.119 | TFLOPs: 39.82 | +63: iteration 1810/ 5494 | consumed samples: 1853440 | consumed tokens: 3795845120 | elapsed time per iteration (s): 5.38 | learning rate: 1.576E-04 | global batch size: 1024 | lm loss: 2.503144E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.392 | TFLOPs: 42.57 | +63: iteration 1820/ 5494 | consumed samples: 1863680 | consumed tokens: 3816816640 | elapsed time per iteration (s): 5.52 | learning rate: 1.571E-04 | global batch size: 1024 | lm loss: 2.503102E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.543 | TFLOPs: 41.48 | +63: iteration 1830/ 5494 | consumed samples: 1873920 | consumed tokens: 3837788160 | elapsed time per iteration (s): 5.73 | learning rate: 1.567E-04 | global batch size: 1024 | lm loss: 2.499267E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.595 | TFLOPs: 39.93 | +63: iteration 1840/ 5494 | consumed samples: 1884160 | consumed tokens: 3858759680 | elapsed time per iteration (s): 5.77 | learning rate: 1.563E-04 | global batch size: 1024 | lm loss: 2.495012E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.458 | TFLOPs: 39.67 | +63: iteration 1850/ 5494 | consumed samples: 1894400 | consumed tokens: 3879731200 | elapsed time per iteration (s): 5.40 | learning rate: 1.558E-04 | global batch size: 1024 | lm loss: 2.481451E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.760 | TFLOPs: 42.42 | +63: iteration 1860/ 5494 | consumed samples: 1904640 | consumed tokens: 3900702720 | elapsed time per iteration (s): 5.65 | learning rate: 1.554E-04 | global batch size: 1024 | lm loss: 2.471438E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.230 | TFLOPs: 40.52 | +63: iteration 1870/ 5494 | consumed samples: 1914880 | consumed tokens: 3921674240 | elapsed time per iteration (s): 5.92 | learning rate: 1.549E-04 | global batch size: 1024 | lm loss: 2.473380E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.056 | TFLOPs: 38.69 | +63: iteration 1880/ 5494 | consumed samples: 1925120 | consumed tokens: 3942645760 | elapsed time per iteration (s): 5.98 | learning rate: 1.545E-04 | global batch size: 1024 | lm loss: 2.481865E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.269 | TFLOPs: 38.29 | +63: iteration 1890/ 5494 | consumed samples: 1935360 | consumed tokens: 3963617280 | elapsed time per iteration (s): 5.90 | learning rate: 1.540E-04 | global batch size: 1024 | lm loss: 2.477287E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.635 | TFLOPs: 38.82 | +63: iteration 1900/ 5494 | consumed samples: 1945600 | consumed tokens: 3984588800 | elapsed time per iteration (s): 6.15 | learning rate: 1.536E-04 | global batch size: 1024 | lm loss: 2.479792E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 166.543 | TFLOPs: 37.23 | +63: iteration 1910/ 5494 | consumed samples: 1955840 | consumed tokens: 4005560320 | elapsed time per iteration (s): 5.66 | learning rate: 1.531E-04 | global batch size: 1024 | lm loss: 2.469997E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.760 | TFLOPs: 40.41 | +63: iteration 1920/ 5494 | consumed samples: 1966080 | consumed tokens: 4026531840 | elapsed time per iteration (s): 5.92 | learning rate: 1.526E-04 | global batch size: 1024 | lm loss: 2.457316E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.869 | TFLOPs: 38.65 | +63: iteration 1930/ 5494 | consumed samples: 1976320 | consumed tokens: 4047503360 | elapsed time per iteration (s): 5.77 | learning rate: 1.522E-04 | global batch size: 1024 | lm loss: 2.468259E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.359 | TFLOPs: 39.65 | +63: iteration 1940/ 5494 | consumed samples: 1986560 | consumed tokens: 4068474880 | elapsed time per iteration (s): 5.70 | learning rate: 1.517E-04 | global batch size: 1024 | lm loss: 2.480016E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.567 | TFLOPs: 40.15 | +63: iteration 1950/ 5494 | consumed samples: 1996800 | consumed tokens: 4089446400 | elapsed time per iteration (s): 5.51 | learning rate: 1.513E-04 | global batch size: 1024 | lm loss: 2.465201E+00 | grad norm: 0.169 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.690 | TFLOPs: 41.51 | +63: iteration 1960/ 5494 | consumed samples: 2007040 | consumed tokens: 4110417920 | elapsed time per iteration (s): 5.51 | learning rate: 1.508E-04 | global batch size: 1024 | lm loss: 2.452390E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.708 | TFLOPs: 41.52 | +63: iteration 1970/ 5494 | consumed samples: 2017280 | consumed tokens: 4131389440 | elapsed time per iteration (s): 5.71 | learning rate: 1.503E-04 | global batch size: 1024 | lm loss: 2.460163E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.469 | TFLOPs: 40.12 | +63: iteration 1980/ 5494 | consumed samples: 2027520 | consumed tokens: 4152360960 | elapsed time per iteration (s): 5.55 | learning rate: 1.499E-04 | global batch size: 1024 | lm loss: 2.450319E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.357 | TFLOPs: 41.22 | +63: iteration 1990/ 5494 | consumed samples: 2037760 | consumed tokens: 4173332480 | elapsed time per iteration (s): 5.83 | learning rate: 1.494E-04 | global batch size: 1024 | lm loss: 2.453678E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.708 | TFLOPs: 39.28 | + 0: [2022-12-01 21:42:19,394] [INFO] [logging.py:68:log_dist] [Rank 0] step=2000, skipped=0, lr=[0.00014893268673031913, 0.00014893268673031913, 0.00014893268673031913], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +63: iteration 2000/ 5494 | consumed samples: 2048000 | consumed tokens: 4194304000 | elapsed time per iteration (s): 5.56 | learning rate: 1.489E-04 | global batch size: 1024 | lm loss: 2.456314E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.304 | TFLOPs: 41.20 | + 0: steps: 2000 loss: 2.4764 iter time (s): 5.840 samples/sec: 175.337 +63: ------------------------------------------------------------------------------------------ +63: valid loss at iteration 2000 | lm loss value: 2.456982E+00 | lm loss PPL: 1.166954E+01 | +63: ------------------------------------------------------------------------------------------ + 0: saving checkpoint at iteration 2000 to checkpoints_8b7beta + 0: [2022-12-01 21:42:21,932] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2000 is begin to save! +32: [2022-12-01 21:42:21,963] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_24-model_01-model_states.pt... +32: [2022-12-01 21:42:21,963] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_24-model_00-model_states.pt... + 0: [2022-12-01 21:42:21,963] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_01-model_00-model_states.pt... + 0: [2022-12-01 21:42:21,963] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_01-model_01-model_states.pt... +32: [2022-12-01 21:42:22,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_24-model_01-model_states.pt. +32: [2022-12-01 21:42:22,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_24-model_00-model_states.pt. +32: [2022-12-01 21:42:22,393] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_25-model_01-model_states.pt... +32: [2022-12-01 21:42:22,393] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_25-model_00-model_states.pt... + 0: [2022-12-01 21:42:22,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_01-model_01-model_states.pt. + 0: [2022-12-01 21:42:22,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_01-model_00-model_states.pt. + 0: [2022-12-01 21:42:22,404] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_03-model_01-model_states.pt... + 0: [2022-12-01 21:42:22,404] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_03-model_00-model_states.pt... +32: [2022-12-01 21:42:22,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_25-model_00-model_states.pt. +32: [2022-12-01 21:42:22,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_25-model_01-model_states.pt. +32: [2022-12-01 21:42:22,643] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_26-model_00-model_states.pt... +32: [2022-12-01 21:42:22,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_26-model_01-model_states.pt... + 0: [2022-12-01 21:42:22,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_03-model_01-model_states.pt. + 0: [2022-12-01 21:42:22,659] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_04-model_01-model_states.pt... + 0: [2022-12-01 21:42:22,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_03-model_00-model_states.pt. + 0: [2022-12-01 21:42:22,668] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_04-model_00-model_states.pt... +32: [2022-12-01 21:42:22,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_26-model_01-model_states.pt. +32: [2022-12-01 21:42:22,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_27-model_01-model_states.pt... +32: [2022-12-01 21:42:22,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_26-model_00-model_states.pt. +32: [2022-12-01 21:42:22,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_27-model_00-model_states.pt... + 0: [2022-12-01 21:42:22,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_04-model_00-model_states.pt. + 0: [2022-12-01 21:42:22,919] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_05-model_00-model_states.pt... + 0: [2022-12-01 21:42:22,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_04-model_01-model_states.pt. + 0: [2022-12-01 21:42:22,935] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_05-model_01-model_states.pt... + 0: [2022-12-01 21:42:23,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_05-model_01-model_states.pt. + 0: [2022-12-01 21:42:23,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_06-model_01-model_states.pt... + 0: [2022-12-01 21:42:23,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_05-model_00-model_states.pt. + 0: [2022-12-01 21:42:23,192] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_06-model_00-model_states.pt... +32: [2022-12-01 21:42:23,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_27-model_00-model_states.pt. +32: [2022-12-01 21:42:23,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_27-model_01-model_states.pt. +32: [2022-12-01 21:42:23,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_28-model_00-model_states.pt... +32: [2022-12-01 21:42:23,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_28-model_01-model_states.pt... +32: [2022-12-01 21:42:23,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_28-model_01-model_states.pt. +32: [2022-12-01 21:42:23,430] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_29-model_01-model_states.pt... + 0: [2022-12-01 21:42:23,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_06-model_00-model_states.pt. + 0: [2022-12-01 21:42:23,434] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_07-model_00-model_states.pt... +32: [2022-12-01 21:42:23,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_28-model_00-model_states.pt. +32: [2022-12-01 21:42:23,436] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_29-model_00-model_states.pt... + 0: [2022-12-01 21:42:23,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_06-model_01-model_states.pt. + 0: [2022-12-01 21:42:23,444] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_07-model_01-model_states.pt... +32: [2022-12-01 21:42:23,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_29-model_01-model_states.pt. +32: [2022-12-01 21:42:23,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_30-model_01-model_states.pt... + 0: [2022-12-01 21:42:23,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_07-model_01-model_states.pt. + 0: [2022-12-01 21:42:23,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_07-model_00-model_states.pt. + 0: [2022-12-01 21:42:23,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_08-model_01-model_states.pt... + 0: [2022-12-01 21:42:23,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_08-model_00-model_states.pt... +32: [2022-12-01 21:42:23,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_29-model_00-model_states.pt. +32: [2022-12-01 21:42:23,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_30-model_00-model_states.pt... + 0: [2022-12-01 21:42:23,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_08-model_01-model_states.pt. + 0: [2022-12-01 21:42:23,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_09-model_01-model_states.pt... + 0: [2022-12-01 21:42:23,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_08-model_00-model_states.pt. + 0: [2022-12-01 21:42:23,946] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_09-model_00-model_states.pt... +32: [2022-12-01 21:42:23,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_30-model_01-model_states.pt. +32: [2022-12-01 21:42:23,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_31-model_01-model_states.pt... +32: [2022-12-01 21:42:23,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_30-model_00-model_states.pt. +32: [2022-12-01 21:42:23,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_31-model_00-model_states.pt... + 0: [2022-12-01 21:42:24,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_09-model_01-model_states.pt. + 0: [2022-12-01 21:42:24,164] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_10-model_01-model_states.pt... + 0: [2022-12-01 21:42:24,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_09-model_00-model_states.pt. + 0: [2022-12-01 21:42:24,187] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_10-model_00-model_states.pt... +32: [2022-12-01 21:42:24,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_31-model_01-model_states.pt. +32: [2022-12-01 21:42:24,189] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_32-model_01-model_states.pt... +32: [2022-12-01 21:42:24,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_31-model_00-model_states.pt. +32: [2022-12-01 21:42:24,198] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_32-model_00-model_states.pt... + 0: [2022-12-01 21:42:24,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_10-model_01-model_states.pt. + 0: [2022-12-01 21:42:24,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_10-model_00-model_states.pt. + 0: [2022-12-01 21:42:24,422] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_11-model_01-model_states.pt... + 0: [2022-12-01 21:42:24,422] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_11-model_00-model_states.pt... +32: [2022-12-01 21:42:24,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_32-model_01-model_states.pt. +32: [2022-12-01 21:42:24,428] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_33-model_01-model_states.pt... +32: [2022-12-01 21:42:24,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_32-model_00-model_states.pt. +32: [2022-12-01 21:42:24,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_33-model_00-model_states.pt... +32: [2022-12-01 21:42:24,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_33-model_01-model_states.pt. +32: [2022-12-01 21:42:24,663] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_34-model_01-model_states.pt... +32: [2022-12-01 21:42:24,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_33-model_00-model_states.pt. +32: [2022-12-01 21:42:24,670] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_34-model_00-model_states.pt... + 0: [2022-12-01 21:42:24,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_11-model_00-model_states.pt. + 0: [2022-12-01 21:42:24,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_12-model_00-model_states.pt... + 0: [2022-12-01 21:42:24,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_11-model_01-model_states.pt. + 0: [2022-12-01 21:42:24,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_12-model_01-model_states.pt... +32: [2022-12-01 21:42:24,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_34-model_01-model_states.pt. +32: [2022-12-01 21:42:24,883] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_35-model_01-model_states.pt... +32: [2022-12-01 21:42:24,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_34-model_00-model_states.pt. +32: [2022-12-01 21:42:24,914] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_35-model_00-model_states.pt... + 0: [2022-12-01 21:42:24,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_12-model_00-model_states.pt. + 0: [2022-12-01 21:42:24,914] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_13-model_00-model_states.pt... + 0: [2022-12-01 21:42:24,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_12-model_01-model_states.pt. + 0: [2022-12-01 21:42:24,931] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_13-model_01-model_states.pt... +32: [2022-12-01 21:42:25,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_35-model_01-model_states.pt. +32: [2022-12-01 21:42:25,113] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_36-model_01-model_states.pt... +32: [2022-12-01 21:42:25,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_35-model_00-model_states.pt. +32: [2022-12-01 21:42:25,134] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_36-model_00-model_states.pt... + 0: [2022-12-01 21:42:25,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_13-model_01-model_states.pt. + 0: [2022-12-01 21:42:25,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_13-model_00-model_states.pt. + 0: [2022-12-01 21:42:25,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_14-model_01-model_states.pt... + 0: [2022-12-01 21:42:25,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_14-model_00-model_states.pt... +32: [2022-12-01 21:42:25,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_36-model_01-model_states.pt. +32: [2022-12-01 21:42:25,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_37-model_01-model_states.pt... +32: [2022-12-01 21:42:25,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_36-model_00-model_states.pt. +32: [2022-12-01 21:42:25,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_37-model_00-model_states.pt... + 0: [2022-12-01 21:42:25,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_14-model_00-model_states.pt. + 0: [2022-12-01 21:42:25,406] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_15-model_00-model_states.pt... + 0: [2022-12-01 21:42:25,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_14-model_01-model_states.pt. + 0: [2022-12-01 21:42:25,409] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_15-model_01-model_states.pt... +32: [2022-12-01 21:42:25,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_37-model_01-model_states.pt. +32: [2022-12-01 21:42:25,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_38-model_01-model_states.pt... +32: [2022-12-01 21:42:25,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_37-model_00-model_states.pt. +32: [2022-12-01 21:42:25,589] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_38-model_00-model_states.pt... + 0: [2022-12-01 21:42:25,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_15-model_01-model_states.pt. + 0: [2022-12-01 21:42:25,647] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_16-model_01-model_states.pt... + 0: [2022-12-01 21:42:25,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_15-model_00-model_states.pt. + 0: [2022-12-01 21:42:25,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_16-model_00-model_states.pt... +32: [2022-12-01 21:42:25,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_38-model_01-model_states.pt. +32: [2022-12-01 21:42:25,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_39-model_01-model_states.pt... +32: [2022-12-01 21:42:25,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_38-model_00-model_states.pt. +32: [2022-12-01 21:42:25,823] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_39-model_00-model_states.pt... + 0: [2022-12-01 21:42:25,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_16-model_01-model_states.pt. + 0: [2022-12-01 21:42:25,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_16-model_00-model_states.pt. + 0: [2022-12-01 21:42:25,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_17-model_01-model_states.pt... + 0: [2022-12-01 21:42:25,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_17-model_00-model_states.pt... +32: [2022-12-01 21:42:26,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_39-model_01-model_states.pt. +32: [2022-12-01 21:42:26,020] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_40-model_01-model_states.pt... +32: [2022-12-01 21:42:26,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_39-model_00-model_states.pt. +32: [2022-12-01 21:42:26,052] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_40-model_00-model_states.pt... + 0: [2022-12-01 21:42:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_17-model_00-model_states.pt. + 0: [2022-12-01 21:42:26,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_18-model_00-model_states.pt... + 0: [2022-12-01 21:42:26,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_17-model_01-model_states.pt. + 0: [2022-12-01 21:42:26,164] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_18-model_01-model_states.pt... +32: [2022-12-01 21:42:26,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_40-model_01-model_states.pt. +32: [2022-12-01 21:42:26,235] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_41-model_01-model_states.pt... +32: [2022-12-01 21:42:26,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_40-model_00-model_states.pt. +32: [2022-12-01 21:42:26,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_41-model_00-model_states.pt... + 0: [2022-12-01 21:42:26,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_18-model_01-model_states.pt. + 0: [2022-12-01 21:42:26,393] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_19-model_01-model_states.pt... + 0: [2022-12-01 21:42:26,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_18-model_00-model_states.pt. + 0: [2022-12-01 21:42:26,402] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_19-model_00-model_states.pt... +32: [2022-12-01 21:42:26,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_41-model_01-model_states.pt. +32: [2022-12-01 21:42:26,480] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_42-model_01-model_states.pt... +32: [2022-12-01 21:42:26,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_41-model_00-model_states.pt. +32: [2022-12-01 21:42:26,499] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_42-model_00-model_states.pt... + 0: [2022-12-01 21:42:26,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_19-model_01-model_states.pt. + 0: [2022-12-01 21:42:26,633] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_20-model_01-model_states.pt... + 0: [2022-12-01 21:42:26,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_19-model_00-model_states.pt. + 0: [2022-12-01 21:42:26,654] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_20-model_00-model_states.pt... +32: [2022-12-01 21:42:26,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_42-model_00-model_states.pt. +32: [2022-12-01 21:42:26,738] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_43-model_00-model_states.pt... +32: [2022-12-01 21:42:26,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_42-model_01-model_states.pt. +32: [2022-12-01 21:42:26,741] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_43-model_01-model_states.pt... + 0: [2022-12-01 21:42:26,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_20-model_00-model_states.pt. + 0: [2022-12-01 21:42:26,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_21-model_00-model_states.pt... + 0: [2022-12-01 21:42:26,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_20-model_01-model_states.pt. + 0: [2022-12-01 21:42:26,890] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_21-model_01-model_states.pt... +32: [2022-12-01 21:42:26,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_43-model_01-model_states.pt. +32: [2022-12-01 21:42:26,974] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_44-model_01-model_states.pt... +32: [2022-12-01 21:42:26,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_43-model_00-model_states.pt. +32: [2022-12-01 21:42:26,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_44-model_00-model_states.pt... + 0: [2022-12-01 21:42:27,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_21-model_01-model_states.pt. + 0: [2022-12-01 21:42:27,136] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_22-model_01-model_states.pt... + 0: [2022-12-01 21:42:27,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_21-model_00-model_states.pt. + 0: [2022-12-01 21:42:27,158] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_22-model_00-model_states.pt... +32: [2022-12-01 21:42:27,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_44-model_01-model_states.pt. +32: [2022-12-01 21:42:27,220] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_46-model_01-model_states.pt... +32: [2022-12-01 21:42:27,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_44-model_00-model_states.pt. +32: [2022-12-01 21:42:27,222] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_46-model_00-model_states.pt... +32: [2022-12-01 21:42:27,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_46-model_00-model_states.pt. +32: [2022-12-01 21:42:27,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/mp_rank_02_model_states.pt... +32: [2022-12-01 21:42:27,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_46-model_01-model_states.pt. +32: [2022-12-01 21:42:27,252] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/mp_rank_03_model_states.pt... +32: [2022-12-01 21:42:27,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/mp_rank_02_model_states.pt. +32: [2022-12-01 21:42:27,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/mp_rank_03_model_states.pt. + 0: [2022-12-01 21:42:27,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_22-model_00-model_states.pt. + 0: [2022-12-01 21:42:27,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_23-model_00-model_states.pt... + 0: [2022-12-01 21:42:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_22-model_01-model_states.pt. + 0: [2022-12-01 21:42:27,388] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/layer_23-model_01-model_states.pt... + 0: [2022-12-01 21:42:27,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_23-model_00-model_states.pt. + 0: [2022-12-01 21:42:27,628] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_8b7beta/global_step2000/mp_rank_00_model_states.pt + 0: [2022-12-01 21:42:27,628] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/mp_rank_00_model_states.pt... + 0: [2022-12-01 21:42:27,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/layer_23-model_01-model_states.pt. + 0: [2022-12-01 21:42:27,645] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: checkpoints_8b7beta/global_step2000/mp_rank_01_model_states.pt + 0: [2022-12-01 21:42:27,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/mp_rank_01_model_states.pt... + 0: [2022-12-01 21:42:27,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/mp_rank_00_model_states.pt. + 0: [2022-12-01 21:42:27,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/mp_rank_01_model_states.pt. +32: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt... +32: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt... +32: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt... +32: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt... + 0: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +32: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt... +32: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt... + 0: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... + 0: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +54: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt... +54: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt... +54: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt... +54: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt... +54: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt... +56: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt... +56: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt... +56: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt... +56: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt... +56: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt... +62: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt... +62: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt... +62: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt... +62: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt... +62: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt... +60: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt... +60: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt... +60: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt... +60: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt... +60: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt... +51: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt... +51: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt... +51: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt... +51: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt... +51: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt... +63: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt... +63: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt... +63: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt... +63: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt... +63: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt... +39: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt... +39: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt... +39: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt... +39: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt... +39: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt... + 7: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... + 7: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... + 7: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +57: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt... +57: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt... +57: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt... +57: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt... +57: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt... +43: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt... +43: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt... +43: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt... +43: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt... +43: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt... +38: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt... +38: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt... +38: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt... +38: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt... +38: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt... +36: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt... +36: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt... +36: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt... +36: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt... +36: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt... +42: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt... +42: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt... +42: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt... +42: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt... +42: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt... +58: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt... +58: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt... +58: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt... +58: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt... +58: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt... +44: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt... +44: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt... +44: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt... +44: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt... +44: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt... +34: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt... +34: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt... +34: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt... +34: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt... +34: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt... +28: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... +28: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... +50: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt... +50: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt... +50: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt... +50: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt... +50: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt... +46: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt... +46: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt... +46: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt... +46: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt... +46: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt... +49: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt... +49: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt... +49: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt... +49: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt... +49: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt... +20: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... +20: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... +41: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt... +41: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt... +41: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt... +41: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt... +41: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt... +55: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt... +55: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt... +55: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt... +55: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt... +55: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt... +37: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt... +37: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt... +37: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt... +37: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt... +37: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt... +33: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt... +33: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt... +33: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt... +33: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt... +33: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt... +52: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt... +52: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt... +52: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt... +52: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt... +52: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt... +31: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... +61: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt... +61: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt... +61: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt... +61: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt... +61: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt... +15: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... +15: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... +45: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt... +45: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt... +45: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt... +45: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt... +45: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt... +47: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt... +47: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt... +47: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt... +47: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt... +47: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt... + 3: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... + 3: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... + 9: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... + 9: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... + 9: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... + 9: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +59: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt... +59: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt... +59: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt... +59: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt... +59: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt... +12: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +12: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... +12: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +12: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +12: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... + 2: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... + 2: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... + 2: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... +35: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt... +35: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt... +35: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt... +35: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt... +35: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt... +19: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... +19: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... +19: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... +19: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... +19: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... +48: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt... +48: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt... +48: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt... +48: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt... +48: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt... +53: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt... +53: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt... +53: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt... +53: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt... +53: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt... +40: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt... +40: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt... +40: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt... +40: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt... +40: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt... +30: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... +23: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... +16: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... +16: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... +16: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... + 0: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +54: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt... +54: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt... +56: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt... +56: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt... +56: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt... +62: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt... +62: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt... +62: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt... + 4: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... + 4: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... + 4: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... + 4: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... + 4: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +60: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt... +51: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt... +51: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt... +51: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt... +63: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt... +63: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt... +63: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt... +17: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... +17: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... + 1: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... + 1: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... + 1: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... + 1: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... + 1: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +39: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt... +39: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt... +39: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt... + 7: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +11: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... +57: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt... +57: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt... +57: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt... +27: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... +27: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... +27: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... +27: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... +43: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt... +43: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt... +43: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt... +25: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... +25: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... +25: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... +13: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +13: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +13: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +13: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +14: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +14: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +14: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +14: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +38: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt... +38: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt... +38: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt... +22: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... +22: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... +36: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt... +24: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... +24: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... +24: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... +24: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... +42: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt... +42: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt... +58: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt... +58: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt... +44: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt... +44: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt... +34: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt... +28: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... +28: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... +28: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... +32: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt... +50: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt... +50: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt... +50: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt... +46: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt... +46: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt... +46: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt... +26: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... +26: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... +26: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... +49: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt... +49: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt... +49: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt... + 5: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... + 5: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... + 5: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... + 5: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +20: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... +20: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... +41: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt... +41: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt... +41: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt... +55: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt... +55: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt... +55: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt... +10: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +10: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... +10: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... +10: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... +37: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt... + 8: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... + 8: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... + 8: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... + 8: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... + 8: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... +18: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... +18: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... +18: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... +18: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... +21: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... +21: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... +21: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... +21: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... +33: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt... +33: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt... +33: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt... +52: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt... +52: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt... + 6: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... + 6: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... + 6: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... + 6: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... + 6: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +31: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... +31: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... +31: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... +31: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... +61: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt... +61: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt... +15: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +15: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +45: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt... +47: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt... + 3: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... + 3: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... + 3: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... + 3: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... + 3: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... + 9: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... + 9: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... +59: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt... +59: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt... +59: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt... +12: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... +12: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... +12: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... + 2: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... +29: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... +29: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... +29: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... +35: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt... +35: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt... +19: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... +48: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt... +48: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt... +53: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt... +53: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt... +53: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt... +40: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt... +40: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt... +40: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt... +30: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... +23: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... +23: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... +23: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... +23: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... +23: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... +16: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... +16: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... +16: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... +16: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... + 0: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... +54: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt... + 4: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +60: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt... +60: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt... +17: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... + 1: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... + 7: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... + 7: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... +11: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +11: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +11: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... +11: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +11: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... +27: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... +27: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... +27: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... +27: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... +25: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... +13: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... +13: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... +14: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... +14: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... +22: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... +22: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... +22: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... +36: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt... +36: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt... +24: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... +24: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... +42: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt... +58: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt... +44: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt... +34: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt... +34: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt... +28: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... +32: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt... +26: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... +26: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... +26: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... +26: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... + 5: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... +20: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... +20: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... +10: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +37: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt... +37: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt... + 8: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... +18: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... +18: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... +18: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... +21: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... +21: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... +52: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt... + 6: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... +31: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... +31: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... +61: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt... +15: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +15: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +45: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt... +45: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt... +47: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt... +47: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt... + 3: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... + 9: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... + 9: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... + 2: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... + 2: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +29: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... +29: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... +29: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... +35: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt... +19: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... +19: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... +48: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt... +30: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... +30: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... +23: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... +16: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... + 0: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... + 4: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... + 4: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... +17: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... +17: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... + 1: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... + 1: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... + 7: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +11: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +11: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... +25: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... +13: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... +13: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... +14: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... +14: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... +22: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... +22: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... +24: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... +24: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... +28: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... +26: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... + 5: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... + 5: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... + 5: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... +20: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... +20: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... +10: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... + 8: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... + 8: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... +18: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... +21: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... + 6: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... +31: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... +15: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... +15: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... + 2: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... +29: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... +29: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... +30: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... +30: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... +23: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... + 0: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +17: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... + 7: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... +25: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... +25: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... +22: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... +28: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... +10: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... +21: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... + 6: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... + 2: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... +30: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... +30: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... + 0: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... +17: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... +25: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... +10: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +17: [2022-12-01 21:42:27,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... + 0: [2022-12-01 21:42:28,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +32: [2022-12-01 21:42:28,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt. +32: [2022-12-01 21:42:28,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt +32: [2022-12-01 21:42:28,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 0: [2022-12-01 21:42:28,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. + 0: [2022-12-01 21:42:28,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. + 0: [2022-12-01 21:42:28,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. + 0: [2022-12-01 21:42:28,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. + 0: [2022-12-01 21:42:28,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt + 0: [2022-12-01 21:42:28,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt + 0: [2022-12-01 21:42:28,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt + 0: [2022-12-01 21:42:28,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 0: [2022-12-01 21:42:28,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 0: [2022-12-01 21:42:28,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt + 0: [2022-12-01 21:42:28,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 0: [2022-12-01 21:42:28,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +32: [2022-12-01 21:42:28,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt. +32: [2022-12-01 21:42:28,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt +32: [2022-12-01 21:42:28,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +32: [2022-12-01 21:42:28,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt. +32: [2022-12-01 21:42:28,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt. +32: [2022-12-01 21:42:28,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt +32: [2022-12-01 21:42:28,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt +32: [2022-12-01 21:42:28,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +32: [2022-12-01 21:42:28,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +46: [2022-12-01 21:42:28,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt. +46: [2022-12-01 21:42:28,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt +46: [2022-12-01 21:42:28,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt. +46: [2022-12-01 21:42:28,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +46: [2022-12-01 21:42:28,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt +46: [2022-12-01 21:42:28,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +46: [2022-12-01 21:42:28,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt. +46: [2022-12-01 21:42:28,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt +46: [2022-12-01 21:42:28,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +43: [2022-12-01 21:42:28,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt. +43: [2022-12-01 21:42:28,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt +43: [2022-12-01 21:42:28,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +58: [2022-12-01 21:42:28,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt. +58: [2022-12-01 21:42:28,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt. +58: [2022-12-01 21:42:28,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt +58: [2022-12-01 21:42:28,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt +58: [2022-12-01 21:42:28,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +58: [2022-12-01 21:42:28,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +27: [2022-12-01 21:42:28,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. +27: [2022-12-01 21:42:28,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. +27: [2022-12-01 21:42:28,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt +27: [2022-12-01 21:42:28,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt +27: [2022-12-01 21:42:28,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +27: [2022-12-01 21:42:28,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 0: [2022-12-01 21:42:28,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. + 0: [2022-12-01 21:42:28,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt + 0: [2022-12-01 21:42:28,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 0: [2022-12-01 21:42:28,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. + 0: [2022-12-01 21:42:28,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt + 0: [2022-12-01 21:42:28,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 0: [2022-12-01 21:42:28,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. + 0: [2022-12-01 21:42:28,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt + 0: [2022-12-01 21:42:28,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +58: [2022-12-01 21:42:28,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt. +58: [2022-12-01 21:42:28,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt +58: [2022-12-01 21:42:28,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +58: [2022-12-01 21:42:28,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt. +58: [2022-12-01 21:42:28,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt +58: [2022-12-01 21:42:28,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +32: [2022-12-01 21:42:28,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt. +32: [2022-12-01 21:42:28,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt. +32: [2022-12-01 21:42:28,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt. +32: [2022-12-01 21:42:28,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt +32: [2022-12-01 21:42:28,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt +32: [2022-12-01 21:42:28,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt. +32: [2022-12-01 21:42:28,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +32: [2022-12-01 21:42:28,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +32: [2022-12-01 21:42:28,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt +32: [2022-12-01 21:42:28,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt +32: [2022-12-01 21:42:28,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +32: [2022-12-01 21:42:28,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +36: [2022-12-01 21:42:28,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt. +36: [2022-12-01 21:42:28,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt. +36: [2022-12-01 21:42:28,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt +36: [2022-12-01 21:42:28,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt +50: [2022-12-01 21:42:28,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt. +36: [2022-12-01 21:42:28,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +36: [2022-12-01 21:42:28,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +50: [2022-12-01 21:42:28,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt +50: [2022-12-01 21:42:28,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +56: [2022-12-01 21:42:28,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt. +55: [2022-12-01 21:42:28,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt. +55: [2022-12-01 21:42:28,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt. +55: [2022-12-01 21:42:28,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt +55: [2022-12-01 21:42:28,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt +55: [2022-12-01 21:42:28,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +55: [2022-12-01 21:42:28,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +43: [2022-12-01 21:42:28,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt. +46: [2022-12-01 21:42:28,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt. +43: [2022-12-01 21:42:28,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt +46: [2022-12-01 21:42:28,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt +46: [2022-12-01 21:42:28,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +43: [2022-12-01 21:42:28,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +43: [2022-12-01 21:42:28,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt. +43: [2022-12-01 21:42:28,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt +43: [2022-12-01 21:42:28,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +37: [2022-12-01 21:42:28,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt. +37: [2022-12-01 21:42:28,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt. +37: [2022-12-01 21:42:28,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt +37: [2022-12-01 21:42:28,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt +37: [2022-12-01 21:42:28,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +37: [2022-12-01 21:42:28,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +56: [2022-12-01 21:42:28,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt +56: [2022-12-01 21:42:28,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +37: [2022-12-01 21:42:28,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt. +56: [2022-12-01 21:42:28,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt. +37: [2022-12-01 21:42:28,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt +37: [2022-12-01 21:42:28,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +37: [2022-12-01 21:42:28,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt. +50: [2022-12-01 21:42:28,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt. +50: [2022-12-01 21:42:28,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt +50: [2022-12-01 21:42:28,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +36: [2022-12-01 21:42:28,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt. +36: [2022-12-01 21:42:28,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt +36: [2022-12-01 21:42:28,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +50: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt. +50: [2022-12-01 21:42:28,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt +50: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +28: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. +28: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. +28: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. +28: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. +28: [2022-12-01 21:42:28,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt +28: [2022-12-01 21:42:28,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt +28: [2022-12-01 21:42:28,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt +28: [2022-12-01 21:42:28,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt +28: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +28: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +28: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +16: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. +16: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. +16: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. +16: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. +28: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +16: [2022-12-01 21:42:28,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt +16: [2022-12-01 21:42:28,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt +16: [2022-12-01 21:42:28,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt +16: [2022-12-01 21:42:28,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt +16: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +16: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +36: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt. +16: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +16: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +36: [2022-12-01 21:42:28,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt +37: [2022-12-01 21:42:28,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt +36: [2022-12-01 21:42:28,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +37: [2022-12-01 21:42:28,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +46: [2022-12-01 21:42:28,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt. +46: [2022-12-01 21:42:28,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt +46: [2022-12-01 21:42:28,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +15: [2022-12-01 21:42:28,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +15: [2022-12-01 21:42:28,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. +15: [2022-12-01 21:42:28,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +15: [2022-12-01 21:42:28,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. +15: [2022-12-01 21:42:28,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +15: [2022-12-01 21:42:28,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt +15: [2022-12-01 21:42:28,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +15: [2022-12-01 21:42:28,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt +15: [2022-12-01 21:42:28,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +15: [2022-12-01 21:42:28,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +15: [2022-12-01 21:42:28,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +15: [2022-12-01 21:42:28,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +11: [2022-12-01 21:42:28,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. +11: [2022-12-01 21:42:28,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. +11: [2022-12-01 21:42:28,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +22: [2022-12-01 21:42:28,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. +22: [2022-12-01 21:42:28,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. +44: [2022-12-01 21:42:28,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt. + 6: [2022-12-01 21:42:28,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. + 6: [2022-12-01 21:42:28,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +11: [2022-12-01 21:42:28,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt +22: [2022-12-01 21:42:28,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt +44: [2022-12-01 21:42:28,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt + 6: [2022-12-01 21:42:28,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt + 6: [2022-12-01 21:42:28,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt +11: [2022-12-01 21:42:28,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt +11: [2022-12-01 21:42:28,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +22: [2022-12-01 21:42:28,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt +22: [2022-12-01 21:42:28,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +44: [2022-12-01 21:42:28,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 6: [2022-12-01 21:42:28,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 6: [2022-12-01 21:42:28,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +11: [2022-12-01 21:42:28,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +22: [2022-12-01 21:42:28,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 6: [2022-12-01 21:42:28,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. +11: [2022-12-01 21:42:28,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +11: [2022-12-01 21:42:28,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +22: [2022-12-01 21:42:28,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. + 6: [2022-12-01 21:42:28,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt +22: [2022-12-01 21:42:28,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. + 6: [2022-12-01 21:42:28,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +22: [2022-12-01 21:42:28,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt + 6: [2022-12-01 21:42:28,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +22: [2022-12-01 21:42:28,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt + 6: [2022-12-01 21:42:28,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +22: [2022-12-01 21:42:28,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 6: [2022-12-01 21:42:28,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +22: [2022-12-01 21:42:28,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +43: [2022-12-01 21:42:28,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt. +43: [2022-12-01 21:42:28,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt +43: [2022-12-01 21:42:28,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +55: [2022-12-01 21:42:28,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt. +38: [2022-12-01 21:42:28,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt. +38: [2022-12-01 21:42:28,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt. +38: [2022-12-01 21:42:28,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt +38: [2022-12-01 21:42:28,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt +38: [2022-12-01 21:42:28,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +38: [2022-12-01 21:42:28,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +55: [2022-12-01 21:42:28,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt +55: [2022-12-01 21:42:28,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +19: [2022-12-01 21:42:28,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. +19: [2022-12-01 21:42:28,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. +19: [2022-12-01 21:42:28,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt +19: [2022-12-01 21:42:28,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt +55: [2022-12-01 21:42:28,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt. +19: [2022-12-01 21:42:28,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +19: [2022-12-01 21:42:28,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +55: [2022-12-01 21:42:28,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt +55: [2022-12-01 21:42:28,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +31: [2022-12-01 21:42:28,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. +31: [2022-12-01 21:42:28,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. +31: [2022-12-01 21:42:28,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. +31: [2022-12-01 21:42:28,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. +31: [2022-12-01 21:42:28,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt +31: [2022-12-01 21:42:28,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt +31: [2022-12-01 21:42:28,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt +31: [2022-12-01 21:42:28,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt +31: [2022-12-01 21:42:28,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +31: [2022-12-01 21:42:28,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +31: [2022-12-01 21:42:28,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +31: [2022-12-01 21:42:28,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +21: [2022-12-01 21:42:28,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. +21: [2022-12-01 21:42:28,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt +21: [2022-12-01 21:42:28,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. +21: [2022-12-01 21:42:28,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +21: [2022-12-01 21:42:28,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt +21: [2022-12-01 21:42:28,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +18: [2022-12-01 21:42:28,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. +18: [2022-12-01 21:42:28,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt +18: [2022-12-01 21:42:28,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +43: [2022-12-01 21:42:28,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt. +43: [2022-12-01 21:42:28,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt +43: [2022-12-01 21:42:28,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +20: [2022-12-01 21:42:28,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. +20: [2022-12-01 21:42:28,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. +20: [2022-12-01 21:42:28,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. +20: [2022-12-01 21:42:28,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. +20: [2022-12-01 21:42:28,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt +20: [2022-12-01 21:42:28,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt +20: [2022-12-01 21:42:28,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt +20: [2022-12-01 21:42:28,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt +20: [2022-12-01 21:42:28,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +20: [2022-12-01 21:42:28,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +20: [2022-12-01 21:42:28,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +20: [2022-12-01 21:42:28,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +18: [2022-12-01 21:42:28,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. +18: [2022-12-01 21:42:28,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt +18: [2022-12-01 21:42:28,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +21: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. + 3: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. + 3: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. + 3: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. + 3: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. + 3: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +21: [2022-12-01 21:42:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt +21: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 3: [2022-12-01 21:42:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt +18: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. + 3: [2022-12-01 21:42:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt + 3: [2022-12-01 21:42:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt + 3: [2022-12-01 21:42:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt +21: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. + 3: [2022-12-01 21:42:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt + 3: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 3: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +18: [2022-12-01 21:42:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt +21: [2022-12-01 21:42:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt + 3: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 3: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +18: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +21: [2022-12-01 21:42:28,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 3: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 2: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. + 2: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. + 2: [2022-12-01 21:42:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt + 2: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. + 2: [2022-12-01 21:42:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt + 2: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 2: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. + 2: [2022-12-01 21:42:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt + 2: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +35: [2022-12-01 21:42:28,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt. + 2: [2022-12-01 21:42:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt + 2: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +35: [2022-12-01 21:42:28,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt + 2: [2022-12-01 21:42:28,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +35: [2022-12-01 21:42:28,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +35: [2022-12-01 21:42:28,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt. +35: [2022-12-01 21:42:28,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt +35: [2022-12-01 21:42:28,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +49: [2022-12-01 21:42:28,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt. +49: [2022-12-01 21:42:28,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt. +49: [2022-12-01 21:42:28,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt. +49: [2022-12-01 21:42:28,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt +49: [2022-12-01 21:42:28,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt +49: [2022-12-01 21:42:28,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt +49: [2022-12-01 21:42:28,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +49: [2022-12-01 21:42:28,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +49: [2022-12-01 21:42:28,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 4: [2022-12-01 21:42:28,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. + 4: [2022-12-01 21:42:28,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. + 4: [2022-12-01 21:42:28,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. + 4: [2022-12-01 21:42:28,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. + 4: [2022-12-01 21:42:28,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt + 4: [2022-12-01 21:42:28,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt + 4: [2022-12-01 21:42:28,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt + 4: [2022-12-01 21:42:28,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt + 4: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 4: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 4: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 4: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +13: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +13: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. +13: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. +13: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +12: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +12: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +12: [2022-12-01 21:42:28,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +13: [2022-12-01 21:42:28,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt +12: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +13: [2022-12-01 21:42:28,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +12: [2022-12-01 21:42:28,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +13: [2022-12-01 21:42:28,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt +13: [2022-12-01 21:42:28,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +13: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +13: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +13: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +12: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +13: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +12: [2022-12-01 21:42:28,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. +12: [2022-12-01 21:42:28,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt +12: [2022-12-01 21:42:28,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +12: [2022-12-01 21:42:28,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. +12: [2022-12-01 21:42:28,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt +12: [2022-12-01 21:42:28,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +27: [2022-12-01 21:42:28,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. +27: [2022-12-01 21:42:28,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. +27: [2022-12-01 21:42:28,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt +27: [2022-12-01 21:42:28,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt +27: [2022-12-01 21:42:28,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +27: [2022-12-01 21:42:28,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +56: [2022-12-01 21:42:28,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt +56: [2022-12-01 21:42:28,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +56: [2022-12-01 21:42:28,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt. +56: [2022-12-01 21:42:28,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt +56: [2022-12-01 21:42:28,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +56: [2022-12-01 21:42:28,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt. +56: [2022-12-01 21:42:28,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt +56: [2022-12-01 21:42:28,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +39: [2022-12-01 21:42:28,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt. +39: [2022-12-01 21:42:28,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt. +39: [2022-12-01 21:42:28,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt. +44: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt. +41: [2022-12-01 21:42:28,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt. +41: [2022-12-01 21:42:28,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt. +41: [2022-12-01 21:42:28,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt. +48: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt. +48: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt. +41: [2022-12-01 21:42:28,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt +41: [2022-12-01 21:42:28,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt +41: [2022-12-01 21:42:28,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt +41: [2022-12-01 21:42:28,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +41: [2022-12-01 21:42:28,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +41: [2022-12-01 21:42:28,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +44: [2022-12-01 21:42:28,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt +41: [2022-12-01 21:42:28,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt. +48: [2022-12-01 21:42:28,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt +48: [2022-12-01 21:42:28,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt +44: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +41: [2022-12-01 21:42:28,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt +48: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +48: [2022-12-01 21:42:28,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +41: [2022-12-01 21:42:28,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +48: [2022-12-01 21:42:28,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt. +41: [2022-12-01 21:42:28,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt. +48: [2022-12-01 21:42:28,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt +41: [2022-12-01 21:42:28,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt +48: [2022-12-01 21:42:28,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +41: [2022-12-01 21:42:28,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +48: [2022-12-01 21:42:28,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt. +48: [2022-12-01 21:42:28,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt +48: [2022-12-01 21:42:28,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +39: [2022-12-01 21:42:28,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt. +39: [2022-12-01 21:42:28,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt +39: [2022-12-01 21:42:28,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt +39: [2022-12-01 21:42:28,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt +39: [2022-12-01 21:42:28,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +39: [2022-12-01 21:42:28,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +39: [2022-12-01 21:42:28,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt +39: [2022-12-01 21:42:28,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +39: [2022-12-01 21:42:28,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +62: [2022-12-01 21:42:28,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt. +62: [2022-12-01 21:42:28,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt. +62: [2022-12-01 21:42:28,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt. +62: [2022-12-01 21:42:28,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt. +26: [2022-12-01 21:42:28,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. +26: [2022-12-01 21:42:28,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. +26: [2022-12-01 21:42:28,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. +26: [2022-12-01 21:42:28,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. +35: [2022-12-01 21:42:28,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt. +35: [2022-12-01 21:42:28,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt. +35: [2022-12-01 21:42:28,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt +35: [2022-12-01 21:42:28,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt +35: [2022-12-01 21:42:28,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +35: [2022-12-01 21:42:28,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +62: [2022-12-01 21:42:28,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt +62: [2022-12-01 21:42:28,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt +62: [2022-12-01 21:42:28,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt +62: [2022-12-01 21:42:28,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt +62: [2022-12-01 21:42:28,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +62: [2022-12-01 21:42:28,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +62: [2022-12-01 21:42:28,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +62: [2022-12-01 21:42:28,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +38: [2022-12-01 21:42:28,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt. +38: [2022-12-01 21:42:28,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt. +38: [2022-12-01 21:42:28,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt +38: [2022-12-01 21:42:28,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +38: [2022-12-01 21:42:28,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt +38: [2022-12-01 21:42:28,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +61: [2022-12-01 21:42:28,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt. +61: [2022-12-01 21:42:28,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt. +61: [2022-12-01 21:42:28,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt. +47: [2022-12-01 21:42:28,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt. +47: [2022-12-01 21:42:28,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt. +47: [2022-12-01 21:42:28,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt. +47: [2022-12-01 21:42:28,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt. +47: [2022-12-01 21:42:28,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt +47: [2022-12-01 21:42:28,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt +47: [2022-12-01 21:42:28,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +47: [2022-12-01 21:42:28,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt +47: [2022-12-01 21:42:28,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt +47: [2022-12-01 21:42:28,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +47: [2022-12-01 21:42:28,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +47: [2022-12-01 21:42:28,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +61: [2022-12-01 21:42:28,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt. +28: [2022-12-01 21:42:28,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. +61: [2022-12-01 21:42:28,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt +61: [2022-12-01 21:42:28,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt +61: [2022-12-01 21:42:28,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt +61: [2022-12-01 21:42:28,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt +61: [2022-12-01 21:42:28,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +61: [2022-12-01 21:42:28,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +61: [2022-12-01 21:42:28,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +61: [2022-12-01 21:42:28,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +28: [2022-12-01 21:42:28,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt +28: [2022-12-01 21:42:28,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +10: [2022-12-01 21:42:28,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. +10: [2022-12-01 21:42:28,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +10: [2022-12-01 21:42:28,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +10: [2022-12-01 21:42:28,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +10: [2022-12-01 21:42:28,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +10: [2022-12-01 21:42:28,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt +10: [2022-12-01 21:42:28,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +10: [2022-12-01 21:42:28,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +10: [2022-12-01 21:42:28,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +14: [2022-12-01 21:42:28,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. +14: [2022-12-01 21:42:28,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. +14: [2022-12-01 21:42:28,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt +14: [2022-12-01 21:42:28,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt +14: [2022-12-01 21:42:28,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +14: [2022-12-01 21:42:28,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +59: [2022-12-01 21:42:28,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt. +59: [2022-12-01 21:42:28,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt. +59: [2022-12-01 21:42:28,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt. +59: [2022-12-01 21:42:28,240] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt +59: [2022-12-01 21:42:28,240] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt +59: [2022-12-01 21:42:28,240] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt +59: [2022-12-01 21:42:28,240] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +59: [2022-12-01 21:42:28,240] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +59: [2022-12-01 21:42:28,240] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +59: [2022-12-01 21:42:28,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt. +59: [2022-12-01 21:42:28,240] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt +59: [2022-12-01 21:42:28,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +58: [2022-12-01 21:42:28,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt. +58: [2022-12-01 21:42:28,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt +58: [2022-12-01 21:42:28,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 8: [2022-12-01 21:42:28,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. + 8: [2022-12-01 21:42:28,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. + 8: [2022-12-01 21:42:28,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. + 8: [2022-12-01 21:42:28,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. + 8: [2022-12-01 21:42:28,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt + 8: [2022-12-01 21:42:28,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt + 8: [2022-12-01 21:42:28,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt + 8: [2022-12-01 21:42:28,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt + 8: [2022-12-01 21:42:28,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 8: [2022-12-01 21:42:28,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 8: [2022-12-01 21:42:28,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 8: [2022-12-01 21:42:28,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +14: [2022-12-01 21:42:28,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. +14: [2022-12-01 21:42:28,243] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt +14: [2022-12-01 21:42:28,243] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +38: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt. +17: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. +17: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. +17: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. +45: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt. +17: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. +17: [2022-12-01 21:42:28,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt +17: [2022-12-01 21:42:28,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt +40: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt. +40: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt. +40: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt. +40: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt. +40: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt. +17: [2022-12-01 21:42:28,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt +17: [2022-12-01 21:42:28,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt +17: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +17: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +17: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +17: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +40: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt. +40: [2022-12-01 21:42:28,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt +40: [2022-12-01 21:42:28,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt +40: [2022-12-01 21:42:28,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt +40: [2022-12-01 21:42:28,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt +45: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt. +45: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt. +40: [2022-12-01 21:42:28,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt +45: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt. +45: [2022-12-01 21:42:28,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt +40: [2022-12-01 21:42:28,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt +45: [2022-12-01 21:42:28,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt +40: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +45: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +40: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +40: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +45: [2022-12-01 21:42:28,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt +40: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +45: [2022-12-01 21:42:28,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt +40: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +45: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt. +40: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +45: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +45: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +45: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +45: [2022-12-01 21:42:28,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt +45: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +38: [2022-12-01 21:42:28,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt +38: [2022-12-01 21:42:28,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +60: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt. +60: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt. +40: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt. +40: [2022-12-01 21:42:28,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt +40: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +30: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. +30: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. +30: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. +30: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. +30: [2022-12-01 21:42:28,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt +30: [2022-12-01 21:42:28,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt +30: [2022-12-01 21:42:28,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt +30: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +30: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +30: [2022-12-01 21:42:28,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt +30: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +30: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +50: [2022-12-01 21:42:28,253] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt. +50: [2022-12-01 21:42:28,253] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt +50: [2022-12-01 21:42:28,253] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +14: [2022-12-01 21:42:28,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +14: [2022-12-01 21:42:28,254] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +14: [2022-12-01 21:42:28,254] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +24: [2022-12-01 21:42:28,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. +24: [2022-12-01 21:42:28,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt +24: [2022-12-01 21:42:28,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +24: [2022-12-01 21:42:28,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. +24: [2022-12-01 21:42:28,256] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt +24: [2022-12-01 21:42:28,256] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +18: [2022-12-01 21:42:28,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. +18: [2022-12-01 21:42:28,257] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt +18: [2022-12-01 21:42:28,257] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 9: [2022-12-01 21:42:28,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. + 9: [2022-12-01 21:42:28,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. + 9: [2022-12-01 21:42:28,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. + 9: [2022-12-01 21:42:28,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. + 9: [2022-12-01 21:42:28,259] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt + 9: [2022-12-01 21:42:28,259] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt + 9: [2022-12-01 21:42:28,259] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt + 9: [2022-12-01 21:42:28,259] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt + 9: [2022-12-01 21:42:28,259] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 9: [2022-12-01 21:42:28,259] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 9: [2022-12-01 21:42:28,259] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 9: [2022-12-01 21:42:28,259] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +24: [2022-12-01 21:42:28,259] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. +24: [2022-12-01 21:42:28,259] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. +24: [2022-12-01 21:42:28,260] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt +24: [2022-12-01 21:42:28,260] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt +24: [2022-12-01 21:42:28,260] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +24: [2022-12-01 21:42:28,260] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +46: [2022-12-01 21:42:28,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt. +46: [2022-12-01 21:42:28,261] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt +46: [2022-12-01 21:42:28,262] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +25: [2022-12-01 21:42:28,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. +25: [2022-12-01 21:42:28,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. +25: [2022-12-01 21:42:28,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. +25: [2022-12-01 21:42:28,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. +25: [2022-12-01 21:42:28,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt +25: [2022-12-01 21:42:28,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt +25: [2022-12-01 21:42:28,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt +25: [2022-12-01 21:42:28,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt +25: [2022-12-01 21:42:28,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +25: [2022-12-01 21:42:28,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +25: [2022-12-01 21:42:28,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +25: [2022-12-01 21:42:28,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +57: [2022-12-01 21:42:28,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt. +57: [2022-12-01 21:42:28,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt. +57: [2022-12-01 21:42:28,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt. +57: [2022-12-01 21:42:28,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt. +57: [2022-12-01 21:42:28,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt. +57: [2022-12-01 21:42:28,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt +57: [2022-12-01 21:42:28,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt +57: [2022-12-01 21:42:28,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt +57: [2022-12-01 21:42:28,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt +57: [2022-12-01 21:42:28,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt +57: [2022-12-01 21:42:28,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +57: [2022-12-01 21:42:28,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +57: [2022-12-01 21:42:28,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +57: [2022-12-01 21:42:28,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +57: [2022-12-01 21:42:28,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +35: [2022-12-01 21:42:28,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt. +35: [2022-12-01 21:42:28,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt +35: [2022-12-01 21:42:28,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +42: [2022-12-01 21:42:28,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt. +42: [2022-12-01 21:42:28,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt. +42: [2022-12-01 21:42:28,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt. +42: [2022-12-01 21:42:28,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt +42: [2022-12-01 21:42:28,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt +42: [2022-12-01 21:42:28,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt +42: [2022-12-01 21:42:28,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +42: [2022-12-01 21:42:28,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +42: [2022-12-01 21:42:28,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +42: [2022-12-01 21:42:28,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt. +42: [2022-12-01 21:42:28,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt. +42: [2022-12-01 21:42:28,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt +42: [2022-12-01 21:42:28,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +42: [2022-12-01 21:42:28,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt +42: [2022-12-01 21:42:28,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +19: [2022-12-01 21:42:28,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. +19: [2022-12-01 21:42:28,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt +19: [2022-12-01 21:42:28,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +11: [2022-12-01 21:42:28,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. +34: [2022-12-01 21:42:28,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt. +26: [2022-12-01 21:42:28,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt +26: [2022-12-01 21:42:28,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt + 5: [2022-12-01 21:42:28,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. + 5: [2022-12-01 21:42:28,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. + 5: [2022-12-01 21:42:28,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. + 5: [2022-12-01 21:42:28,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. +41: [2022-12-01 21:42:28,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt. + 6: [2022-12-01 21:42:28,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +29: [2022-12-01 21:42:28,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. +29: [2022-12-01 21:42:28,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. +29: [2022-12-01 21:42:28,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. +29: [2022-12-01 21:42:28,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. +26: [2022-12-01 21:42:28,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt +26: [2022-12-01 21:42:28,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt +26: [2022-12-01 21:42:28,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +26: [2022-12-01 21:42:28,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +34: [2022-12-01 21:42:28,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt +26: [2022-12-01 21:42:28,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +26: [2022-12-01 21:42:28,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 5: [2022-12-01 21:42:28,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt + 5: [2022-12-01 21:42:28,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt + 5: [2022-12-01 21:42:28,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt + 5: [2022-12-01 21:42:28,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt +41: [2022-12-01 21:42:28,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt + 6: [2022-12-01 21:42:28,269] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +29: [2022-12-01 21:42:28,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt +29: [2022-12-01 21:42:28,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt +11: [2022-12-01 21:42:28,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt +34: [2022-12-01 21:42:28,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 5: [2022-12-01 21:42:28,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 5: [2022-12-01 21:42:28,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +41: [2022-12-01 21:42:28,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 6: [2022-12-01 21:42:28,269] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +29: [2022-12-01 21:42:28,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt +29: [2022-12-01 21:42:28,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt +11: [2022-12-01 21:42:28,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +34: [2022-12-01 21:42:28,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt. + 5: [2022-12-01 21:42:28,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 5: [2022-12-01 21:42:28,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +29: [2022-12-01 21:42:28,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +29: [2022-12-01 21:42:28,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +57: [2022-12-01 21:42:28,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt. +34: [2022-12-01 21:42:28,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt. +29: [2022-12-01 21:42:28,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +29: [2022-12-01 21:42:28,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +34: [2022-12-01 21:42:28,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt +34: [2022-12-01 21:42:28,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt +34: [2022-12-01 21:42:28,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +34: [2022-12-01 21:42:28,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +34: [2022-12-01 21:42:28,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt. +34: [2022-12-01 21:42:28,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt +60: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt. +60: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt. +34: [2022-12-01 21:42:28,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +60: [2022-12-01 21:42:28,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt +60: [2022-12-01 21:42:28,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt +60: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +60: [2022-12-01 21:42:28,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt +60: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +60: [2022-12-01 21:42:28,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt +60: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +60: [2022-12-01 21:42:28,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +57: [2022-12-01 21:42:28,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt +57: [2022-12-01 21:42:28,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +19: [2022-12-01 21:42:28,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. +19: [2022-12-01 21:42:28,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt +19: [2022-12-01 21:42:28,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 7: [2022-12-01 21:42:28,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. + 7: [2022-12-01 21:42:28,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. + 7: [2022-12-01 21:42:28,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt + 7: [2022-12-01 21:42:28,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. + 7: [2022-12-01 21:42:28,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt + 7: [2022-12-01 21:42:28,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 7: [2022-12-01 21:42:28,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 7: [2022-12-01 21:42:28,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt + 7: [2022-12-01 21:42:28,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +10: [2022-12-01 21:42:28,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. +10: [2022-12-01 21:42:28,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt +10: [2022-12-01 21:42:28,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 1: [2022-12-01 21:42:28,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. + 1: [2022-12-01 21:42:28,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. + 1: [2022-12-01 21:42:28,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt + 1: [2022-12-01 21:42:28,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt + 1: [2022-12-01 21:42:28,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 1: [2022-12-01 21:42:28,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +10: [2022-12-01 21:42:28,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +10: [2022-12-01 21:42:28,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +10: [2022-12-01 21:42:28,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +42: [2022-12-01 21:42:28,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt. +42: [2022-12-01 21:42:28,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt +42: [2022-12-01 21:42:28,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 1: [2022-12-01 21:42:28,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. + 1: [2022-12-01 21:42:28,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt + 1: [2022-12-01 21:42:28,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +35: [2022-12-01 21:42:28,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt. +35: [2022-12-01 21:42:28,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt +35: [2022-12-01 21:42:28,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 7: [2022-12-01 21:42:28,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. + 7: [2022-12-01 21:42:28,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt + 7: [2022-12-01 21:42:28,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 7: [2022-12-01 21:42:28,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. + 7: [2022-12-01 21:42:28,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt + 7: [2022-12-01 21:42:28,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +44: [2022-12-01 21:42:28,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt. +23: [2022-12-01 21:42:28,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. +23: [2022-12-01 21:42:28,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. +23: [2022-12-01 21:42:28,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. +23: [2022-12-01 21:42:28,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. + 7: [2022-12-01 21:42:28,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. +44: [2022-12-01 21:42:28,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt +23: [2022-12-01 21:42:28,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt +23: [2022-12-01 21:42:28,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt + 7: [2022-12-01 21:42:28,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt +44: [2022-12-01 21:42:28,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +23: [2022-12-01 21:42:28,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt +23: [2022-12-01 21:42:28,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt + 7: [2022-12-01 21:42:28,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +23: [2022-12-01 21:42:28,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +23: [2022-12-01 21:42:28,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +23: [2022-12-01 21:42:28,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +23: [2022-12-01 21:42:28,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +23: [2022-12-01 21:42:28,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. +23: [2022-12-01 21:42:28,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt +23: [2022-12-01 21:42:28,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 1: [2022-12-01 21:42:28,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. + 1: [2022-12-01 21:42:28,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt + 1: [2022-12-01 21:42:28,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +35: [2022-12-01 21:42:28,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt. +35: [2022-12-01 21:42:28,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt +35: [2022-12-01 21:42:28,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +53: [2022-12-01 21:42:28,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt. +53: [2022-12-01 21:42:28,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt. +53: [2022-12-01 21:42:28,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt. +53: [2022-12-01 21:42:28,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt +53: [2022-12-01 21:42:28,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt +53: [2022-12-01 21:42:28,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt. +53: [2022-12-01 21:42:28,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt. +53: [2022-12-01 21:42:28,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt. +53: [2022-12-01 21:42:28,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +53: [2022-12-01 21:42:28,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +53: [2022-12-01 21:42:28,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt +53: [2022-12-01 21:42:28,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt +53: [2022-12-01 21:42:28,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt +53: [2022-12-01 21:42:28,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +53: [2022-12-01 21:42:28,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt +53: [2022-12-01 21:42:28,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +53: [2022-12-01 21:42:28,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +53: [2022-12-01 21:42:28,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +44: [2022-12-01 21:42:28,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt. +34: [2022-12-01 21:42:28,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt. +44: [2022-12-01 21:42:28,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt +34: [2022-12-01 21:42:28,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt +44: [2022-12-01 21:42:28,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +34: [2022-12-01 21:42:28,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +34: [2022-12-01 21:42:28,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt. +34: [2022-12-01 21:42:28,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt +34: [2022-12-01 21:42:28,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +34: [2022-12-01 21:42:28,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt. +34: [2022-12-01 21:42:28,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt +34: [2022-12-01 21:42:28,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +34: [2022-12-01 21:42:28,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt. +34: [2022-12-01 21:42:28,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt +34: [2022-12-01 21:42:28,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +63: [2022-12-01 21:42:28,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt. +63: [2022-12-01 21:42:28,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt. +63: [2022-12-01 21:42:28,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt. +63: [2022-12-01 21:42:28,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt. +63: [2022-12-01 21:42:28,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt. +63: [2022-12-01 21:42:28,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt. +63: [2022-12-01 21:42:28,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt +63: [2022-12-01 21:42:28,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt. +63: [2022-12-01 21:42:28,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt +63: [2022-12-01 21:42:28,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt +63: [2022-12-01 21:42:28,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +63: [2022-12-01 21:42:28,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt +63: [2022-12-01 21:42:28,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +63: [2022-12-01 21:42:28,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt +63: [2022-12-01 21:42:28,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt +63: [2022-12-01 21:42:28,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt +63: [2022-12-01 21:42:28,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +63: [2022-12-01 21:42:28,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +63: [2022-12-01 21:42:28,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +63: [2022-12-01 21:42:28,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +63: [2022-12-01 21:42:28,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +37: [2022-12-01 21:42:28,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt. +37: [2022-12-01 21:42:28,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt +37: [2022-12-01 21:42:28,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 2: [2022-12-01 21:42:28,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. + 2: [2022-12-01 21:42:28,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt + 2: [2022-12-01 21:42:28,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 1: [2022-12-01 21:42:28,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. + 1: [2022-12-01 21:42:28,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt + 1: [2022-12-01 21:42:28,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +11: [2022-12-01 21:42:28,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +11: [2022-12-01 21:42:28,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +11: [2022-12-01 21:42:28,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +35: [2022-12-01 21:42:28,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt. +35: [2022-12-01 21:42:28,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt +35: [2022-12-01 21:42:28,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 0: [2022-12-01 21:42:28,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt + 0: [2022-12-01 21:42:28,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +31: [2022-12-01 21:42:28,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. +31: [2022-12-01 21:42:28,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt +31: [2022-12-01 21:42:28,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +15: [2022-12-01 21:42:28,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +15: [2022-12-01 21:42:28,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +15: [2022-12-01 21:42:28,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 3: [2022-12-01 21:42:28,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. + 3: [2022-12-01 21:42:28,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt + 3: [2022-12-01 21:42:28,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +33: [2022-12-01 21:42:28,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt. +33: [2022-12-01 21:42:28,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt. +33: [2022-12-01 21:42:28,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt. +33: [2022-12-01 21:42:28,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt. +54: [2022-12-01 21:42:28,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt. +33: [2022-12-01 21:42:28,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt. +33: [2022-12-01 21:42:28,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt. +54: [2022-12-01 21:42:28,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt +33: [2022-12-01 21:42:28,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt. +33: [2022-12-01 21:42:28,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt. +33: [2022-12-01 21:42:28,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt +33: [2022-12-01 21:42:28,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt +33: [2022-12-01 21:42:28,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt +54: [2022-12-01 21:42:28,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +33: [2022-12-01 21:42:28,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt +33: [2022-12-01 21:42:28,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt +33: [2022-12-01 21:42:28,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt +54: [2022-12-01 21:42:28,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt. +33: [2022-12-01 21:42:28,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +33: [2022-12-01 21:42:28,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt +33: [2022-12-01 21:42:28,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt +33: [2022-12-01 21:42:28,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +33: [2022-12-01 21:42:28,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +54: [2022-12-01 21:42:28,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt +54: [2022-12-01 21:42:28,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt. +33: [2022-12-01 21:42:28,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +33: [2022-12-01 21:42:28,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +54: [2022-12-01 21:42:28,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +33: [2022-12-01 21:42:28,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +33: [2022-12-01 21:42:28,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +33: [2022-12-01 21:42:28,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +54: [2022-12-01 21:42:28,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt +54: [2022-12-01 21:42:28,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +54: [2022-12-01 21:42:28,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt. +54: [2022-12-01 21:42:28,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt +54: [2022-12-01 21:42:28,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt. +54: [2022-12-01 21:42:28,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +54: [2022-12-01 21:42:28,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt +54: [2022-12-01 21:42:28,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt. +54: [2022-12-01 21:42:28,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +54: [2022-12-01 21:42:28,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt +54: [2022-12-01 21:42:28,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt. +54: [2022-12-01 21:42:28,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +54: [2022-12-01 21:42:28,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt. +54: [2022-12-01 21:42:28,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt +54: [2022-12-01 21:42:28,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +54: [2022-12-01 21:42:28,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt +54: [2022-12-01 21:42:28,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +26: [2022-12-01 21:42:28,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. +26: [2022-12-01 21:42:28,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt +26: [2022-12-01 21:42:28,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +39: [2022-12-01 21:42:28,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt. +38: [2022-12-01 21:42:28,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt. +38: [2022-12-01 21:42:28,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt +38: [2022-12-01 21:42:28,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +39: [2022-12-01 21:42:28,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt +39: [2022-12-01 21:42:28,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +30: [2022-12-01 21:42:28,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. +30: [2022-12-01 21:42:28,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt +30: [2022-12-01 21:42:28,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +19: [2022-12-01 21:42:28,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. +19: [2022-12-01 21:42:28,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt +19: [2022-12-01 21:42:28,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +51: [2022-12-01 21:42:28,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt. +51: [2022-12-01 21:42:28,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt. +51: [2022-12-01 21:42:28,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt. +51: [2022-12-01 21:42:28,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt +51: [2022-12-01 21:42:28,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt. +51: [2022-12-01 21:42:28,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt +51: [2022-12-01 21:42:28,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt. +51: [2022-12-01 21:42:28,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt. +51: [2022-12-01 21:42:28,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt. +51: [2022-12-01 21:42:28,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt. +51: [2022-12-01 21:42:28,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +51: [2022-12-01 21:42:28,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +51: [2022-12-01 21:42:28,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt +51: [2022-12-01 21:42:28,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt +51: [2022-12-01 21:42:28,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt +51: [2022-12-01 21:42:28,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt +51: [2022-12-01 21:42:28,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt +51: [2022-12-01 21:42:28,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt +51: [2022-12-01 21:42:28,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +51: [2022-12-01 21:42:28,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +51: [2022-12-01 21:42:28,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +51: [2022-12-01 21:42:28,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +16: [2022-12-01 21:42:28,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. +51: [2022-12-01 21:42:28,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +16: [2022-12-01 21:42:28,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt +51: [2022-12-01 21:42:28,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +16: [2022-12-01 21:42:28,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +22: [2022-12-01 21:42:28,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. +22: [2022-12-01 21:42:28,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt +22: [2022-12-01 21:42:28,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +55: [2022-12-01 21:42:28,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt. +55: [2022-12-01 21:42:28,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt +61: [2022-12-01 21:42:28,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt. +55: [2022-12-01 21:42:28,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +61: [2022-12-01 21:42:28,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt +61: [2022-12-01 21:42:28,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 9: [2022-12-01 21:42:28,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. + 9: [2022-12-01 21:42:28,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt + 9: [2022-12-01 21:42:28,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +28: [2022-12-01 21:42:28,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. +28: [2022-12-01 21:42:28,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt +28: [2022-12-01 21:42:28,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +63: [2022-12-01 21:42:28,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt. +63: [2022-12-01 21:42:28,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt +63: [2022-12-01 21:42:28,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +20: [2022-12-01 21:42:28,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. +20: [2022-12-01 21:42:28,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt +20: [2022-12-01 21:42:28,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +14: [2022-12-01 21:42:28,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +14: [2022-12-01 21:42:28,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +14: [2022-12-01 21:42:28,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +52: [2022-12-01 21:42:28,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt. +52: [2022-12-01 21:42:28,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt. +52: [2022-12-01 21:42:28,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt. +52: [2022-12-01 21:42:28,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt +52: [2022-12-01 21:42:28,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt +52: [2022-12-01 21:42:28,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt +52: [2022-12-01 21:42:28,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +52: [2022-12-01 21:42:28,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +52: [2022-12-01 21:42:28,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +52: [2022-12-01 21:42:28,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt. +52: [2022-12-01 21:42:28,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt +52: [2022-12-01 21:42:28,513] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +52: [2022-12-01 21:42:28,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt. +52: [2022-12-01 21:42:28,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt +52: [2022-12-01 21:42:28,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt. +52: [2022-12-01 21:42:28,513] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +52: [2022-12-01 21:42:28,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt +52: [2022-12-01 21:42:28,513] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +52: [2022-12-01 21:42:28,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt. +52: [2022-12-01 21:42:28,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt +52: [2022-12-01 21:42:28,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt. +52: [2022-12-01 21:42:28,513] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +52: [2022-12-01 21:42:28,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt +52: [2022-12-01 21:42:28,513] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +10: [2022-12-01 21:42:28,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +10: [2022-12-01 21:42:28,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +10: [2022-12-01 21:42:28,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 5: [2022-12-01 21:42:28,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. + 5: [2022-12-01 21:42:28,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt + 5: [2022-12-01 21:42:28,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 7: [2022-12-01 21:42:28,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. + 7: [2022-12-01 21:42:28,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt + 7: [2022-12-01 21:42:28,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +56: [2022-12-01 21:42:28,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt. +56: [2022-12-01 21:42:28,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt +56: [2022-12-01 21:42:28,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +17: [2022-12-01 21:42:28,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. +17: [2022-12-01 21:42:28,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt +17: [2022-12-01 21:42:28,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +62: [2022-12-01 21:42:28,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt. +62: [2022-12-01 21:42:28,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt +62: [2022-12-01 21:42:28,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +27: [2022-12-01 21:42:28,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. +27: [2022-12-01 21:42:28,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt + 4: [2022-12-01 21:42:28,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +27: [2022-12-01 21:42:28,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 4: [2022-12-01 21:42:28,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt + 4: [2022-12-01 21:42:28,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +13: [2022-12-01 21:42:28,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. +13: [2022-12-01 21:42:28,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt +13: [2022-12-01 21:42:28,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +41: [2022-12-01 21:42:28,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt. +41: [2022-12-01 21:42:28,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt +41: [2022-12-01 21:42:28,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +48: [2022-12-01 21:42:28,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt. +48: [2022-12-01 21:42:28,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt +48: [2022-12-01 21:42:28,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +36: [2022-12-01 21:42:28,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt. +36: [2022-12-01 21:42:28,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt +36: [2022-12-01 21:42:28,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +40: [2022-12-01 21:42:28,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt. +40: [2022-12-01 21:42:28,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt +40: [2022-12-01 21:42:28,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +18: [2022-12-01 21:42:28,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. +18: [2022-12-01 21:42:28,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt +18: [2022-12-01 21:42:28,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +44: [2022-12-01 21:42:28,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt. +44: [2022-12-01 21:42:28,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt +44: [2022-12-01 21:42:28,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +45: [2022-12-01 21:42:28,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt. +45: [2022-12-01 21:42:28,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt +45: [2022-12-01 21:42:28,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +43: [2022-12-01 21:42:28,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt. +43: [2022-12-01 21:42:28,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt +43: [2022-12-01 21:42:28,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +53: [2022-12-01 21:42:28,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt. +53: [2022-12-01 21:42:28,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt +53: [2022-12-01 21:42:28,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +57: [2022-12-01 21:42:28,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt. +57: [2022-12-01 21:42:28,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt +57: [2022-12-01 21:42:28,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +47: [2022-12-01 21:42:28,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt. + 6: [2022-12-01 21:42:28,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. + 6: [2022-12-01 21:42:28,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt +28: [2022-12-01 21:42:28,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. +28: [2022-12-01 21:42:28,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt +28: [2022-12-01 21:42:28,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 6: [2022-12-01 21:42:28,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +47: [2022-12-01 21:42:28,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt +47: [2022-12-01 21:42:28,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +12: [2022-12-01 21:42:28,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. +12: [2022-12-01 21:42:28,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt +12: [2022-12-01 21:42:28,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 3: [2022-12-01 21:42:28,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. + 3: [2022-12-01 21:42:28,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt + 3: [2022-12-01 21:42:28,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +25: [2022-12-01 21:42:28,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. +25: [2022-12-01 21:42:28,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt +25: [2022-12-01 21:42:28,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +37: [2022-12-01 21:42:28,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt. +37: [2022-12-01 21:42:28,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt +37: [2022-12-01 21:42:28,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +23: [2022-12-01 21:42:28,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. +23: [2022-12-01 21:42:28,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt +23: [2022-12-01 21:42:28,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +15: [2022-12-01 21:42:28,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. +15: [2022-12-01 21:42:28,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt +15: [2022-12-01 21:42:28,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +31: [2022-12-01 21:42:28,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. +31: [2022-12-01 21:42:28,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt +31: [2022-12-01 21:42:28,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +58: [2022-12-01 21:42:28,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt. +58: [2022-12-01 21:42:28,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt +58: [2022-12-01 21:42:28,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +16: [2022-12-01 21:42:28,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. +16: [2022-12-01 21:42:28,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt +16: [2022-12-01 21:42:28,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +10: [2022-12-01 21:42:28,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. +10: [2022-12-01 21:42:28,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt +10: [2022-12-01 21:42:28,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 8: [2022-12-01 21:42:28,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. + 8: [2022-12-01 21:42:28,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt + 8: [2022-12-01 21:42:28,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +50: [2022-12-01 21:42:28,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt. +50: [2022-12-01 21:42:28,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt +50: [2022-12-01 21:42:28,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +38: [2022-12-01 21:42:28,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt. +38: [2022-12-01 21:42:28,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt +38: [2022-12-01 21:42:28,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 2: [2022-12-01 21:42:28,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. + 2: [2022-12-01 21:42:28,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt + 2: [2022-12-01 21:42:28,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +19: [2022-12-01 21:42:28,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. +42: [2022-12-01 21:42:28,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt. +42: [2022-12-01 21:42:28,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt +19: [2022-12-01 21:42:28,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt +42: [2022-12-01 21:42:28,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +19: [2022-12-01 21:42:28,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +26: [2022-12-01 21:42:28,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. +26: [2022-12-01 21:42:28,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt +26: [2022-12-01 21:42:28,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +39: [2022-12-01 21:42:28,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt. +39: [2022-12-01 21:42:28,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt +39: [2022-12-01 21:42:28,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +30: [2022-12-01 21:42:28,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. +30: [2022-12-01 21:42:28,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt +30: [2022-12-01 21:42:28,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +46: [2022-12-01 21:42:28,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt. +46: [2022-12-01 21:42:28,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt +46: [2022-12-01 21:42:28,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +61: [2022-12-01 21:42:28,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt. + 7: [2022-12-01 21:42:28,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. + 7: [2022-12-01 21:42:28,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt + 7: [2022-12-01 21:42:28,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +60: [2022-12-01 21:42:28,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt. +25: [2022-12-01 21:42:28,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. +60: [2022-12-01 21:42:28,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt +60: [2022-12-01 21:42:28,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +25: [2022-12-01 21:42:28,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt +25: [2022-12-01 21:42:28,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +59: [2022-12-01 21:42:28,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt. +21: [2022-12-01 21:42:28,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. +21: [2022-12-01 21:42:28,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt +21: [2022-12-01 21:42:28,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +27: [2022-12-01 21:42:28,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. +27: [2022-12-01 21:42:28,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt +27: [2022-12-01 21:42:28,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +22: [2022-12-01 21:42:28,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. +22: [2022-12-01 21:42:28,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt +22: [2022-12-01 21:42:28,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +17: [2022-12-01 21:42:28,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. +17: [2022-12-01 21:42:28,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt +17: [2022-12-01 21:42:28,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +55: [2022-12-01 21:42:28,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt. +55: [2022-12-01 21:42:28,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt +55: [2022-12-01 21:42:28,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +61: [2022-12-01 21:42:28,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt +61: [2022-12-01 21:42:28,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +20: [2022-12-01 21:42:28,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. +20: [2022-12-01 21:42:28,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt +20: [2022-12-01 21:42:28,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +45: [2022-12-01 21:42:28,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt. +45: [2022-12-01 21:42:28,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt +45: [2022-12-01 21:42:28,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +47: [2022-12-01 21:42:28,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt. +41: [2022-12-01 21:42:28,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt. +41: [2022-12-01 21:42:28,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt +41: [2022-12-01 21:42:28,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +18: [2022-12-01 21:42:28,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. +48: [2022-12-01 21:42:28,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt. +18: [2022-12-01 21:42:28,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt +18: [2022-12-01 21:42:28,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +48: [2022-12-01 21:42:28,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt +48: [2022-12-01 21:42:28,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +24: [2022-12-01 21:42:28,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. +24: [2022-12-01 21:42:28,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt +11: [2022-12-01 21:42:28,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +24: [2022-12-01 21:42:28,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +13: [2022-12-01 21:42:28,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. +11: [2022-12-01 21:42:28,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +13: [2022-12-01 21:42:28,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt +11: [2022-12-01 21:42:28,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +13: [2022-12-01 21:42:28,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +44: [2022-12-01 21:42:28,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt. +44: [2022-12-01 21:42:28,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt +44: [2022-12-01 21:42:28,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +36: [2022-12-01 21:42:28,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt. +36: [2022-12-01 21:42:28,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt +36: [2022-12-01 21:42:28,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +29: [2022-12-01 21:42:28,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. +29: [2022-12-01 21:42:28,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt +29: [2022-12-01 21:42:28,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +43: [2022-12-01 21:42:28,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt. +43: [2022-12-01 21:42:28,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt +43: [2022-12-01 21:42:28,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +21: [2022-12-01 21:42:28,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. + 4: [2022-12-01 21:42:28,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. +21: [2022-12-01 21:42:28,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt +21: [2022-12-01 21:42:28,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 4: [2022-12-01 21:42:28,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt + 4: [2022-12-01 21:42:28,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +47: [2022-12-01 21:42:28,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt +47: [2022-12-01 21:42:28,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +50: [2022-12-01 21:42:28,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt. +50: [2022-12-01 21:42:28,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt +50: [2022-12-01 21:42:28,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 9: [2022-12-01 21:42:28,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. + 9: [2022-12-01 21:42:28,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt + 9: [2022-12-01 21:42:28,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +60: [2022-12-01 21:42:28,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt. +60: [2022-12-01 21:42:28,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt +60: [2022-12-01 21:42:28,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +24: [2022-12-01 21:42:28,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. +24: [2022-12-01 21:42:28,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt +24: [2022-12-01 21:42:28,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +56: [2022-12-01 21:42:28,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt. +56: [2022-12-01 21:42:28,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt +56: [2022-12-01 21:42:28,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +12: [2022-12-01 21:42:28,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +12: [2022-12-01 21:42:28,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +29: [2022-12-01 21:42:28,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. +12: [2022-12-01 21:42:28,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +29: [2022-12-01 21:42:28,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt +29: [2022-12-01 21:42:28,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 8: [2022-12-01 21:42:28,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. + 8: [2022-12-01 21:42:28,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt + 8: [2022-12-01 21:42:28,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +28: [2022-12-01 21:42:28,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. +28: [2022-12-01 21:42:28,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt +28: [2022-12-01 21:42:28,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +53: [2022-12-01 21:42:28,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt. +53: [2022-12-01 21:42:28,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt +53: [2022-12-01 21:42:28,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +46: [2022-12-01 21:42:28,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt. +46: [2022-12-01 21:42:28,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt +46: [2022-12-01 21:42:28,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +57: [2022-12-01 21:42:28,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt. +57: [2022-12-01 21:42:28,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt +57: [2022-12-01 21:42:28,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +58: [2022-12-01 21:42:28,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt. +58: [2022-12-01 21:42:28,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt +58: [2022-12-01 21:42:28,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +62: [2022-12-01 21:42:28,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt. +62: [2022-12-01 21:42:28,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt +62: [2022-12-01 21:42:28,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 1: [2022-12-01 21:42:28,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. + 5: [2022-12-01 21:42:28,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. + 5: [2022-12-01 21:42:28,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt + 5: [2022-12-01 21:42:28,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +37: [2022-12-01 21:42:28,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt. +37: [2022-12-01 21:42:28,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt +37: [2022-12-01 21:42:28,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +49: [2022-12-01 21:42:28,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt. +49: [2022-12-01 21:42:28,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt +49: [2022-12-01 21:42:28,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 6: [2022-12-01 21:42:28,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. + 6: [2022-12-01 21:42:28,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt + 6: [2022-12-01 21:42:28,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +38: [2022-12-01 21:42:28,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt. +38: [2022-12-01 21:42:28,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt +38: [2022-12-01 21:42:28,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +42: [2022-12-01 21:42:28,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt. +42: [2022-12-01 21:42:28,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt +42: [2022-12-01 21:42:28,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 3: [2022-12-01 21:42:28,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. + 3: [2022-12-01 21:42:28,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt + 3: [2022-12-01 21:42:28,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +39: [2022-12-01 21:42:28,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt. +39: [2022-12-01 21:42:28,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt +39: [2022-12-01 21:42:28,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 1: [2022-12-01 21:42:28,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt + 1: [2022-12-01 21:42:28,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 2: [2022-12-01 21:42:28,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. + 2: [2022-12-01 21:42:28,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt + 2: [2022-12-01 21:42:28,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +16: [2022-12-01 21:42:28,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. +16: [2022-12-01 21:42:28,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt +16: [2022-12-01 21:42:28,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +11: [2022-12-01 21:42:28,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +11: [2022-12-01 21:42:28,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +11: [2022-12-01 21:42:28,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +26: [2022-12-01 21:42:28,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. +26: [2022-12-01 21:42:28,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt +15: [2022-12-01 21:42:28,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. +26: [2022-12-01 21:42:28,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +15: [2022-12-01 21:42:28,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt +15: [2022-12-01 21:42:28,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +19: [2022-12-01 21:42:28,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. +19: [2022-12-01 21:42:28,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt +19: [2022-12-01 21:42:28,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +31: [2022-12-01 21:42:28,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. +31: [2022-12-01 21:42:28,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt +31: [2022-12-01 21:42:28,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +23: [2022-12-01 21:42:28,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. +23: [2022-12-01 21:42:28,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt +23: [2022-12-01 21:42:28,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +61: [2022-12-01 21:42:28,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt. + 9: [2022-12-01 21:42:28,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. +61: [2022-12-01 21:42:28,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt + 9: [2022-12-01 21:42:28,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt +61: [2022-12-01 21:42:28,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 9: [2022-12-01 21:42:28,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +30: [2022-12-01 21:42:28,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. +30: [2022-12-01 21:42:28,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt +30: [2022-12-01 21:42:28,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +55: [2022-12-01 21:42:28,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt. +55: [2022-12-01 21:42:28,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt +55: [2022-12-01 21:42:28,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +22: [2022-12-01 21:42:28,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. +22: [2022-12-01 21:42:28,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt +22: [2022-12-01 21:42:28,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +59: [2022-12-01 21:42:28,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt +59: [2022-12-01 21:42:28,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +59: [2022-12-01 21:42:28,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt. +59: [2022-12-01 21:42:28,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt +59: [2022-12-01 21:42:28,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +20: [2022-12-01 21:42:28,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. +20: [2022-12-01 21:42:28,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt +20: [2022-12-01 21:42:28,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +27: [2022-12-01 21:42:28,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. +27: [2022-12-01 21:42:28,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt +27: [2022-12-01 21:42:28,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +25: [2022-12-01 21:42:28,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. +25: [2022-12-01 21:42:28,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt +25: [2022-12-01 21:42:28,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +18: [2022-12-01 21:42:28,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. +18: [2022-12-01 21:42:28,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt +18: [2022-12-01 21:42:28,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +45: [2022-12-01 21:42:28,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt. +45: [2022-12-01 21:42:28,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt +45: [2022-12-01 21:42:28,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +62: [2022-12-01 21:42:28,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt. +62: [2022-12-01 21:42:28,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt +62: [2022-12-01 21:42:28,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +36: [2022-12-01 21:42:28,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt. +36: [2022-12-01 21:42:28,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt +36: [2022-12-01 21:42:28,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 5: [2022-12-01 21:42:28,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. + 5: [2022-12-01 21:42:28,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt + 5: [2022-12-01 21:42:28,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +24: [2022-12-01 21:42:28,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. +24: [2022-12-01 21:42:28,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt +24: [2022-12-01 21:42:28,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +50: [2022-12-01 21:42:28,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt. +50: [2022-12-01 21:42:28,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt +50: [2022-12-01 21:42:28,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +56: [2022-12-01 21:42:28,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt. +56: [2022-12-01 21:42:28,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt +56: [2022-12-01 21:42:28,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +48: [2022-12-01 21:42:28,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt. +48: [2022-12-01 21:42:28,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt +48: [2022-12-01 21:42:28,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +13: [2022-12-01 21:42:28,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +13: [2022-12-01 21:42:28,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +13: [2022-12-01 21:42:28,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +58: [2022-12-01 21:42:28,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt. +58: [2022-12-01 21:42:28,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt +58: [2022-12-01 21:42:28,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +43: [2022-12-01 21:42:28,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt. +43: [2022-12-01 21:42:28,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt +43: [2022-12-01 21:42:28,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +37: [2022-12-01 21:42:28,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt. +37: [2022-12-01 21:42:28,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt +37: [2022-12-01 21:42:28,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +16: [2022-12-01 21:42:28,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. +16: [2022-12-01 21:42:28,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt +16: [2022-12-01 21:42:28,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +25: [2022-12-01 21:42:28,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. +31: [2022-12-01 21:42:28,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. +25: [2022-12-01 21:42:28,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt +31: [2022-12-01 21:42:28,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt +31: [2022-12-01 21:42:28,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +25: [2022-12-01 21:42:28,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +21: [2022-12-01 21:42:28,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. +21: [2022-12-01 21:42:28,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt +21: [2022-12-01 21:42:28,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +61: [2022-12-01 21:42:28,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt. +61: [2022-12-01 21:42:28,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt +61: [2022-12-01 21:42:28,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +49: [2022-12-01 21:42:28,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt. +19: [2022-12-01 21:42:28,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. +49: [2022-12-01 21:42:28,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt +19: [2022-12-01 21:42:28,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt +49: [2022-12-01 21:42:28,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +19: [2022-12-01 21:42:28,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +30: [2022-12-01 21:42:28,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. + 6: [2022-12-01 21:42:28,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. +30: [2022-12-01 21:42:28,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt + 6: [2022-12-01 21:42:28,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt +30: [2022-12-01 21:42:28,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 6: [2022-12-01 21:42:28,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 8: [2022-12-01 21:42:28,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. + 8: [2022-12-01 21:42:28,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt + 8: [2022-12-01 21:42:28,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. + 8: [2022-12-01 21:42:28,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 8: [2022-12-01 21:42:28,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt + 8: [2022-12-01 21:42:28,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +14: [2022-12-01 21:42:28,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +21: [2022-12-01 21:42:28,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. +14: [2022-12-01 21:42:28,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +14: [2022-12-01 21:42:28,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +21: [2022-12-01 21:42:28,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt +21: [2022-12-01 21:42:28,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +29: [2022-12-01 21:42:28,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. + 5: [2022-12-01 21:42:28,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. + 5: [2022-12-01 21:42:28,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt + 5: [2022-12-01 21:42:28,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +20: [2022-12-01 21:42:28,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. +20: [2022-12-01 21:42:28,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt +20: [2022-12-01 21:42:28,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +17: [2022-12-01 21:42:28,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. +17: [2022-12-01 21:42:28,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt +17: [2022-12-01 21:42:28,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +47: [2022-12-01 21:42:28,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt. +47: [2022-12-01 21:42:28,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt +47: [2022-12-01 21:42:28,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +56: [2022-12-01 21:42:28,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt. + 9: [2022-12-01 21:42:28,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. + 9: [2022-12-01 21:42:28,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +44: [2022-12-01 21:42:28,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt. + 9: [2022-12-01 21:42:28,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +44: [2022-12-01 21:42:28,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt +12: [2022-12-01 21:42:28,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +44: [2022-12-01 21:42:28,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +12: [2022-12-01 21:42:28,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +50: [2022-12-01 21:42:28,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt. +12: [2022-12-01 21:42:28,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +50: [2022-12-01 21:42:28,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt +50: [2022-12-01 21:42:28,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +29: [2022-12-01 21:42:28,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt +29: [2022-12-01 21:42:28,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +56: [2022-12-01 21:42:28,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt +56: [2022-12-01 21:42:28,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +14: [2022-12-01 21:42:28,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +14: [2022-12-01 21:42:28,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +14: [2022-12-01 21:42:28,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +13: [2022-12-01 21:42:28,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +13: [2022-12-01 21:42:28,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +13: [2022-12-01 21:42:28,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 2: [2022-12-01 21:42:28,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. + 2: [2022-12-01 21:42:28,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt + 2: [2022-12-01 21:42:28,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +27: [2022-12-01 21:42:28,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. +27: [2022-12-01 21:42:28,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt +27: [2022-12-01 21:42:28,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +59: [2022-12-01 21:42:28,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt. +26: [2022-12-01 21:42:28,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. +26: [2022-12-01 21:42:28,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt +26: [2022-12-01 21:42:28,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +62: [2022-12-01 21:42:28,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt. +62: [2022-12-01 21:42:28,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt +62: [2022-12-01 21:42:28,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +48: [2022-12-01 21:42:28,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt. +48: [2022-12-01 21:42:28,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt +48: [2022-12-01 21:42:28,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +22: [2022-12-01 21:42:28,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. +22: [2022-12-01 21:42:28,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt +22: [2022-12-01 21:42:28,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +39: [2022-12-01 21:42:28,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt. +39: [2022-12-01 21:42:28,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt +39: [2022-12-01 21:42:28,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +23: [2022-12-01 21:42:28,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. +23: [2022-12-01 21:42:28,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt +23: [2022-12-01 21:42:28,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 4: [2022-12-01 21:42:28,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +59: [2022-12-01 21:42:28,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt + 4: [2022-12-01 21:42:28,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +59: [2022-12-01 21:42:28,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 4: [2022-12-01 21:42:28,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +15: [2022-12-01 21:42:28,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +15: [2022-12-01 21:42:28,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +29: [2022-12-01 21:42:28,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. +15: [2022-12-01 21:42:28,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +29: [2022-12-01 21:42:28,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt +29: [2022-12-01 21:42:28,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +12: [2022-12-01 21:42:28,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. +12: [2022-12-01 21:42:28,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt +12: [2022-12-01 21:42:28,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +60: [2022-12-01 21:42:28,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt. +60: [2022-12-01 21:42:28,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt +60: [2022-12-01 21:42:28,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +36: [2022-12-01 21:42:28,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt. +36: [2022-12-01 21:42:28,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt +36: [2022-12-01 21:42:28,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +60: [2022-12-01 21:42:28,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt. +60: [2022-12-01 21:42:28,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt +60: [2022-12-01 21:42:28,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +10: [2022-12-01 21:42:28,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. +10: [2022-12-01 21:42:28,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt +10: [2022-12-01 21:42:28,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 4: [2022-12-01 21:42:28,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +55: [2022-12-01 21:42:28,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt. + 4: [2022-12-01 21:42:28,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +55: [2022-12-01 21:42:28,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt + 4: [2022-12-01 21:42:28,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +55: [2022-12-01 21:42:28,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +14: [2022-12-01 21:42:28,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. +14: [2022-12-01 21:42:28,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt +14: [2022-12-01 21:42:28,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +47: [2022-12-01 21:42:28,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt. +47: [2022-12-01 21:42:28,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt +47: [2022-12-01 21:42:28,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 1: [2022-12-01 21:42:28,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. + 1: [2022-12-01 21:42:28,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt + 1: [2022-12-01 21:42:28,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +49: [2022-12-01 21:42:28,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt. +49: [2022-12-01 21:42:28,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt +49: [2022-12-01 21:42:28,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +17: [2022-12-01 21:42:28,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. +17: [2022-12-01 21:42:28,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt +17: [2022-12-01 21:42:28,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +59: [2022-12-01 21:42:28,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt. +59: [2022-12-01 21:42:28,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt +59: [2022-12-01 21:42:28,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +49: [2022-12-01 21:42:28,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt. +49: [2022-12-01 21:42:28,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt +49: [2022-12-01 21:42:28,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +49: [2022-12-01 21:42:28,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt. +49: [2022-12-01 21:42:28,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt +49: [2022-12-01 21:42:28,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +44: [2022-12-01 21:42:28,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt. +44: [2022-12-01 21:42:28,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt +44: [2022-12-01 21:42:28,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +11: [2022-12-01 21:42:28,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. +11: [2022-12-01 21:42:28,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt +11: [2022-12-01 21:42:28,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +18: [2022-12-01 21:42:28,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. +18: [2022-12-01 21:42:28,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt +18: [2022-12-01 21:42:28,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 1: [2022-12-01 21:42:28,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. + 1: [2022-12-01 21:42:28,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt + 1: [2022-12-01 21:42:28,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +24: [2022-12-01 21:42:28,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. +24: [2022-12-01 21:42:28,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step2000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt +24: [2022-12-01 21:42:28,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! + 0: successfully saved checkpoint at iteration 2000 to checkpoints_8b7beta +63: time (ms) | save-checkpoint: 6995.11 +63: iteration 2010/ 5494 | consumed samples: 2058240 | consumed tokens: 4215275520 | elapsed time per iteration (s): 6.56 | learning rate: 1.485E-04 | global batch size: 1024 | lm loss: 2.446432E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 156.002 | TFLOPs: 34.88 | +63: iteration 2020/ 5494 | consumed samples: 2068480 | consumed tokens: 4236247040 | elapsed time per iteration (s): 5.58 | learning rate: 1.480E-04 | global batch size: 1024 | lm loss: 2.437475E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.392 | TFLOPs: 41.00 | +63: iteration 2030/ 5494 | consumed samples: 2078720 | consumed tokens: 4257218560 | elapsed time per iteration (s): 5.69 | learning rate: 1.475E-04 | global batch size: 1024 | lm loss: 2.430424E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.842 | TFLOPs: 40.21 | +63: iteration 2040/ 5494 | consumed samples: 2088960 | consumed tokens: 4278190080 | elapsed time per iteration (s): 5.39 | learning rate: 1.470E-04 | global batch size: 1024 | lm loss: 2.441438E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.097 | TFLOPs: 42.50 | +63: iteration 2050/ 5494 | consumed samples: 2099200 | consumed tokens: 4299161600 | elapsed time per iteration (s): 5.53 | learning rate: 1.466E-04 | global batch size: 1024 | lm loss: 2.449977E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.259 | TFLOPs: 41.42 | +63: iteration 2060/ 5494 | consumed samples: 2109440 | consumed tokens: 4320133120 | elapsed time per iteration (s): 5.77 | learning rate: 1.461E-04 | global batch size: 1024 | lm loss: 2.426794E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.618 | TFLOPs: 39.71 | +63: iteration 2070/ 5494 | consumed samples: 2119680 | consumed tokens: 4341104640 | elapsed time per iteration (s): 5.51 | learning rate: 1.456E-04 | global batch size: 1024 | lm loss: 2.439267E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.755 | TFLOPs: 41.53 | +63: iteration 2080/ 5494 | consumed samples: 2129920 | consumed tokens: 4362076160 | elapsed time per iteration (s): 5.89 | learning rate: 1.451E-04 | global batch size: 1024 | lm loss: 2.452396E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.805 | TFLOPs: 38.86 | +63: iteration 2090/ 5494 | consumed samples: 2140160 | consumed tokens: 4383047680 | elapsed time per iteration (s): 5.69 | learning rate: 1.447E-04 | global batch size: 1024 | lm loss: 2.448587E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.986 | TFLOPs: 40.24 | +63: iteration 2100/ 5494 | consumed samples: 2150400 | consumed tokens: 4404019200 | elapsed time per iteration (s): 5.87 | learning rate: 1.442E-04 | global batch size: 1024 | lm loss: 2.409014E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.352 | TFLOPs: 38.98 | +63: iteration 2110/ 5494 | consumed samples: 2160640 | consumed tokens: 4424990720 | elapsed time per iteration (s): 5.81 | learning rate: 1.437E-04 | global batch size: 1024 | lm loss: 2.410908E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.177 | TFLOPs: 39.39 | +63: iteration 2120/ 5494 | consumed samples: 2170880 | consumed tokens: 4445962240 | elapsed time per iteration (s): 5.77 | learning rate: 1.432E-04 | global batch size: 1024 | lm loss: 2.421890E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.482 | TFLOPs: 39.68 | +63: iteration 2130/ 5494 | consumed samples: 2181120 | consumed tokens: 4466933760 | elapsed time per iteration (s): 5.74 | learning rate: 1.427E-04 | global batch size: 1024 | lm loss: 2.410042E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.270 | TFLOPs: 39.86 | +63: iteration 2140/ 5494 | consumed samples: 2191360 | consumed tokens: 4487905280 | elapsed time per iteration (s): 5.91 | learning rate: 1.423E-04 | global batch size: 1024 | lm loss: 2.410999E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.170 | TFLOPs: 38.72 | +63: iteration 2150/ 5494 | consumed samples: 2201600 | consumed tokens: 4508876800 | elapsed time per iteration (s): 5.62 | learning rate: 1.418E-04 | global batch size: 1024 | lm loss: 2.425050E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.126 | TFLOPs: 40.72 | +63: iteration 2160/ 5494 | consumed samples: 2211840 | consumed tokens: 4529848320 | elapsed time per iteration (s): 5.50 | learning rate: 1.413E-04 | global batch size: 1024 | lm loss: 2.406507E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.312 | TFLOPs: 41.65 | +63: iteration 2170/ 5494 | consumed samples: 2222080 | consumed tokens: 4550819840 | elapsed time per iteration (s): 5.74 | learning rate: 1.408E-04 | global batch size: 1024 | lm loss: 2.400420E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.470 | TFLOPs: 39.90 | +63: iteration 2180/ 5494 | consumed samples: 2232320 | consumed tokens: 4571791360 | elapsed time per iteration (s): 5.74 | learning rate: 1.403E-04 | global batch size: 1024 | lm loss: 2.389329E+00 | grad norm: 0.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.547 | TFLOPs: 39.92 | +63: iteration 2190/ 5494 | consumed samples: 2242560 | consumed tokens: 4592762880 | elapsed time per iteration (s): 5.90 | learning rate: 1.398E-04 | global batch size: 1024 | lm loss: 2.405150E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.628 | TFLOPs: 38.82 | +63: iteration 2200/ 5494 | consumed samples: 2252800 | consumed tokens: 4613734400 | elapsed time per iteration (s): 5.61 | learning rate: 1.393E-04 | global batch size: 1024 | lm loss: 2.389934E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.404 | TFLOPs: 40.78 | +63: iteration 2210/ 5494 | consumed samples: 2263040 | consumed tokens: 4634705920 | elapsed time per iteration (s): 5.73 | learning rate: 1.388E-04 | global batch size: 1024 | lm loss: 2.407726E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.643 | TFLOPs: 39.94 | +63: iteration 2220/ 5494 | consumed samples: 2273280 | consumed tokens: 4655677440 | elapsed time per iteration (s): 5.65 | learning rate: 1.383E-04 | global batch size: 1024 | lm loss: 2.394110E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.186 | TFLOPs: 40.51 | +63: iteration 2230/ 5494 | consumed samples: 2283520 | consumed tokens: 4676648960 | elapsed time per iteration (s): 5.54 | learning rate: 1.378E-04 | global batch size: 1024 | lm loss: 2.383247E+00 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.854 | TFLOPs: 41.33 | +63: iteration 2240/ 5494 | consumed samples: 2293760 | consumed tokens: 4697620480 | elapsed time per iteration (s): 5.80 | learning rate: 1.373E-04 | global batch size: 1024 | lm loss: 2.385425E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.659 | TFLOPs: 39.50 | +63: iteration 2250/ 5494 | consumed samples: 2304000 | consumed tokens: 4718592000 | elapsed time per iteration (s): 5.59 | learning rate: 1.369E-04 | global batch size: 1024 | lm loss: 2.377464E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.101 | TFLOPs: 40.94 | +63: iteration 2260/ 5494 | consumed samples: 2314240 | consumed tokens: 4739563520 | elapsed time per iteration (s): 5.88 | learning rate: 1.364E-04 | global batch size: 1024 | lm loss: 2.394505E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.092 | TFLOPs: 38.92 | +63: iteration 2270/ 5494 | consumed samples: 2324480 | consumed tokens: 4760535040 | elapsed time per iteration (s): 5.97 | learning rate: 1.359E-04 | global batch size: 1024 | lm loss: 2.404538E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.668 | TFLOPs: 38.38 | +63: iteration 2280/ 5494 | consumed samples: 2334720 | consumed tokens: 4781506560 | elapsed time per iteration (s): 5.52 | learning rate: 1.354E-04 | global batch size: 1024 | lm loss: 2.381548E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.430 | TFLOPs: 41.46 | +63: iteration 2290/ 5494 | consumed samples: 2344960 | consumed tokens: 4802478080 | elapsed time per iteration (s): 5.52 | learning rate: 1.349E-04 | global batch size: 1024 | lm loss: 2.386615E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.670 | TFLOPs: 41.51 | +63: iteration 2300/ 5494 | consumed samples: 2355200 | consumed tokens: 4823449600 | elapsed time per iteration (s): 5.50 | learning rate: 1.344E-04 | global batch size: 1024 | lm loss: 2.400432E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.326 | TFLOPs: 41.66 | +63: iteration 2310/ 5494 | consumed samples: 2365440 | consumed tokens: 4844421120 | elapsed time per iteration (s): 5.74 | learning rate: 1.339E-04 | global batch size: 1024 | lm loss: 2.368637E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.333 | TFLOPs: 39.87 | +63: iteration 2320/ 5494 | consumed samples: 2375680 | consumed tokens: 4865392640 | elapsed time per iteration (s): 5.66 | learning rate: 1.334E-04 | global batch size: 1024 | lm loss: 2.390203E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.060 | TFLOPs: 40.48 | +63: iteration 2330/ 5494 | consumed samples: 2385920 | consumed tokens: 4886364160 | elapsed time per iteration (s): 5.79 | learning rate: 1.329E-04 | global batch size: 1024 | lm loss: 2.370954E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.856 | TFLOPs: 39.54 | +63: iteration 2340/ 5494 | consumed samples: 2396160 | consumed tokens: 4907335680 | elapsed time per iteration (s): 5.51 | learning rate: 1.324E-04 | global batch size: 1024 | lm loss: 2.379815E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.705 | TFLOPs: 41.52 | +63: iteration 2350/ 5494 | consumed samples: 2406400 | consumed tokens: 4928307200 | elapsed time per iteration (s): 5.50 | learning rate: 1.318E-04 | global batch size: 1024 | lm loss: 2.370548E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.262 | TFLOPs: 41.64 | +63: iteration 2360/ 5494 | consumed samples: 2416640 | consumed tokens: 4949278720 | elapsed time per iteration (s): 5.41 | learning rate: 1.313E-04 | global batch size: 1024 | lm loss: 2.367479E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.424 | TFLOPs: 42.35 | +63: iteration 2370/ 5494 | consumed samples: 2426880 | consumed tokens: 4970250240 | elapsed time per iteration (s): 5.59 | learning rate: 1.308E-04 | global batch size: 1024 | lm loss: 2.383711E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.164 | TFLOPs: 40.95 | +63: iteration 2380/ 5494 | consumed samples: 2437120 | consumed tokens: 4991221760 | elapsed time per iteration (s): 5.52 | learning rate: 1.303E-04 | global batch size: 1024 | lm loss: 2.359937E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.544 | TFLOPs: 41.48 | +63: iteration 2390/ 5494 | consumed samples: 2447360 | consumed tokens: 5012193280 | elapsed time per iteration (s): 5.86 | learning rate: 1.298E-04 | global batch size: 1024 | lm loss: 2.381984E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.865 | TFLOPs: 39.09 | +63: iteration 2400/ 5494 | consumed samples: 2457600 | consumed tokens: 5033164800 | elapsed time per iteration (s): 5.73 | learning rate: 1.293E-04 | global batch size: 1024 | lm loss: 2.341963E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.856 | TFLOPs: 39.99 | +63: iteration 2410/ 5494 | consumed samples: 2467840 | consumed tokens: 5054136320 | elapsed time per iteration (s): 5.41 | learning rate: 1.288E-04 | global batch size: 1024 | lm loss: 2.365147E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.279 | TFLOPs: 42.32 | +63: iteration 2420/ 5494 | consumed samples: 2478080 | consumed tokens: 5075107840 | elapsed time per iteration (s): 5.61 | learning rate: 1.283E-04 | global batch size: 1024 | lm loss: 2.359129E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.626 | TFLOPs: 40.83 | +63: iteration 2430/ 5494 | consumed samples: 2488320 | consumed tokens: 5096079360 | elapsed time per iteration (s): 5.51 | learning rate: 1.278E-04 | global batch size: 1024 | lm loss: 2.351435E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.792 | TFLOPs: 41.54 | +63: iteration 2440/ 5494 | consumed samples: 2498560 | consumed tokens: 5117050880 | elapsed time per iteration (s): 5.85 | learning rate: 1.273E-04 | global batch size: 1024 | lm loss: 2.366449E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.022 | TFLOPs: 39.13 | +63: iteration 2450/ 5494 | consumed samples: 2508800 | consumed tokens: 5138022400 | elapsed time per iteration (s): 5.65 | learning rate: 1.268E-04 | global batch size: 1024 | lm loss: 2.362822E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.337 | TFLOPs: 40.54 | +63: iteration 2460/ 5494 | consumed samples: 2519040 | consumed tokens: 5158993920 | elapsed time per iteration (s): 5.77 | learning rate: 1.263E-04 | global batch size: 1024 | lm loss: 2.345971E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.430 | TFLOPs: 39.67 | +63: iteration 2470/ 5494 | consumed samples: 2529280 | consumed tokens: 5179965440 | elapsed time per iteration (s): 5.72 | learning rate: 1.257E-04 | global batch size: 1024 | lm loss: 2.349396E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.924 | TFLOPs: 40.00 | +63: iteration 2480/ 5494 | consumed samples: 2539520 | consumed tokens: 5200936960 | elapsed time per iteration (s): 5.76 | learning rate: 1.252E-04 | global batch size: 1024 | lm loss: 2.375977E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.804 | TFLOPs: 39.75 | +63: iteration 2490/ 5494 | consumed samples: 2549760 | consumed tokens: 5221908480 | elapsed time per iteration (s): 5.77 | learning rate: 1.247E-04 | global batch size: 1024 | lm loss: 2.345479E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.323 | TFLOPs: 39.64 | +63: iteration 2500/ 5494 | consumed samples: 2560000 | consumed tokens: 5242880000 | elapsed time per iteration (s): 5.38 | learning rate: 1.242E-04 | global batch size: 1024 | lm loss: 2.357598E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.364 | TFLOPs: 42.56 | +63: iteration 2510/ 5494 | consumed samples: 2570240 | consumed tokens: 5263851520 | elapsed time per iteration (s): 5.89 | learning rate: 1.237E-04 | global batch size: 1024 | lm loss: 2.362493E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.995 | TFLOPs: 38.90 | +63: iteration 2520/ 5494 | consumed samples: 2580480 | consumed tokens: 5284823040 | elapsed time per iteration (s): 5.50 | learning rate: 1.232E-04 | global batch size: 1024 | lm loss: 2.340778E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.072 | TFLOPs: 41.60 | +63: iteration 2530/ 5494 | consumed samples: 2590720 | consumed tokens: 5305794560 | elapsed time per iteration (s): 5.48 | learning rate: 1.227E-04 | global batch size: 1024 | lm loss: 2.333778E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.795 | TFLOPs: 41.76 | +63: iteration 2540/ 5494 | consumed samples: 2600960 | consumed tokens: 5326766080 | elapsed time per iteration (s): 5.39 | learning rate: 1.222E-04 | global batch size: 1024 | lm loss: 2.340576E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.138 | TFLOPs: 42.51 | +63: iteration 2550/ 5494 | consumed samples: 2611200 | consumed tokens: 5347737600 | elapsed time per iteration (s): 5.61 | learning rate: 1.216E-04 | global batch size: 1024 | lm loss: 2.346003E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.564 | TFLOPs: 40.82 | +63: iteration 2560/ 5494 | consumed samples: 2621440 | consumed tokens: 5368709120 | elapsed time per iteration (s): 5.85 | learning rate: 1.211E-04 | global batch size: 1024 | lm loss: 2.335997E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.953 | TFLOPs: 39.11 | +63: iteration 2570/ 5494 | consumed samples: 2631680 | consumed tokens: 5389680640 | elapsed time per iteration (s): 5.75 | learning rate: 1.206E-04 | global batch size: 1024 | lm loss: 2.335221E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.994 | TFLOPs: 39.79 | +63: iteration 2580/ 5494 | consumed samples: 2641920 | consumed tokens: 5410652160 | elapsed time per iteration (s): 5.66 | learning rate: 1.201E-04 | global batch size: 1024 | lm loss: 2.345366E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.038 | TFLOPs: 40.47 | +63: iteration 2590/ 5494 | consumed samples: 2652160 | consumed tokens: 5431623680 | elapsed time per iteration (s): 5.50 | learning rate: 1.196E-04 | global batch size: 1024 | lm loss: 2.341470E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.137 | TFLOPs: 41.61 | +63: iteration 2600/ 5494 | consumed samples: 2662400 | consumed tokens: 5452595200 | elapsed time per iteration (s): 6.01 | learning rate: 1.191E-04 | global batch size: 1024 | lm loss: 2.328492E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 170.331 | TFLOPs: 38.08 | +63: iteration 2610/ 5494 | consumed samples: 2672640 | consumed tokens: 5473566720 | elapsed time per iteration (s): 5.49 | learning rate: 1.185E-04 | global batch size: 1024 | lm loss: 2.334725E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.558 | TFLOPs: 41.71 | +63: iteration 2620/ 5494 | consumed samples: 2682880 | consumed tokens: 5494538240 | elapsed time per iteration (s): 5.51 | learning rate: 1.180E-04 | global batch size: 1024 | lm loss: 2.325007E+00 | grad norm: 0.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.762 | TFLOPs: 41.53 | +63: iteration 2630/ 5494 | consumed samples: 2693120 | consumed tokens: 5515509760 | elapsed time per iteration (s): 5.74 | learning rate: 1.175E-04 | global batch size: 1024 | lm loss: 2.338852E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.336 | TFLOPs: 39.87 | +63: iteration 2640/ 5494 | consumed samples: 2703360 | consumed tokens: 5536481280 | elapsed time per iteration (s): 5.52 | learning rate: 1.170E-04 | global batch size: 1024 | lm loss: 2.317383E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.426 | TFLOPs: 41.46 | +63: iteration 2650/ 5494 | consumed samples: 2713600 | consumed tokens: 5557452800 | elapsed time per iteration (s): 5.71 | learning rate: 1.165E-04 | global batch size: 1024 | lm loss: 2.324677E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.456 | TFLOPs: 40.12 | +63: iteration 2660/ 5494 | consumed samples: 2723840 | consumed tokens: 5578424320 | elapsed time per iteration (s): 5.68 | learning rate: 1.159E-04 | global batch size: 1024 | lm loss: 2.307653E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.337 | TFLOPs: 40.32 | +63: iteration 2670/ 5494 | consumed samples: 2734080 | consumed tokens: 5599395840 | elapsed time per iteration (s): 5.50 | learning rate: 1.154E-04 | global batch size: 1024 | lm loss: 2.328857E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.140 | TFLOPs: 41.61 | +63: iteration 2680/ 5494 | consumed samples: 2744320 | consumed tokens: 5620367360 | elapsed time per iteration (s): 5.74 | learning rate: 1.149E-04 | global batch size: 1024 | lm loss: 2.334591E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.469 | TFLOPs: 39.90 | +63: iteration 2690/ 5494 | consumed samples: 2754560 | consumed tokens: 5641338880 | elapsed time per iteration (s): 5.69 | learning rate: 1.144E-04 | global batch size: 1024 | lm loss: 2.313636E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.943 | TFLOPs: 40.23 | +63: iteration 2700/ 5494 | consumed samples: 2764800 | consumed tokens: 5662310400 | elapsed time per iteration (s): 5.83 | learning rate: 1.139E-04 | global batch size: 1024 | lm loss: 2.296777E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.735 | TFLOPs: 39.29 | +63: iteration 2710/ 5494 | consumed samples: 2775040 | consumed tokens: 5683281920 | elapsed time per iteration (s): 5.67 | learning rate: 1.134E-04 | global batch size: 1024 | lm loss: 2.308400E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.624 | TFLOPs: 40.38 | +63: iteration 2720/ 5494 | consumed samples: 2785280 | consumed tokens: 5704253440 | elapsed time per iteration (s): 5.73 | learning rate: 1.128E-04 | global batch size: 1024 | lm loss: 2.315931E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.842 | TFLOPs: 39.98 | +63: iteration 2730/ 5494 | consumed samples: 2795520 | consumed tokens: 5725224960 | elapsed time per iteration (s): 5.54 | learning rate: 1.123E-04 | global batch size: 1024 | lm loss: 2.308488E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.000 | TFLOPs: 41.36 | +63: iteration 2740/ 5494 | consumed samples: 2805760 | consumed tokens: 5746196480 | elapsed time per iteration (s): 5.67 | learning rate: 1.118E-04 | global batch size: 1024 | lm loss: 2.298047E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.454 | TFLOPs: 40.34 | +63: iteration 2750/ 5494 | consumed samples: 2816000 | consumed tokens: 5767168000 | elapsed time per iteration (s): 5.77 | learning rate: 1.113E-04 | global batch size: 1024 | lm loss: 2.310855E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.458 | TFLOPs: 39.67 | +63: iteration 2760/ 5494 | consumed samples: 2826240 | consumed tokens: 5788139520 | elapsed time per iteration (s): 5.52 | learning rate: 1.108E-04 | global batch size: 1024 | lm loss: 2.304008E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.530 | TFLOPs: 41.48 | +63: iteration 2770/ 5494 | consumed samples: 2836480 | consumed tokens: 5809111040 | elapsed time per iteration (s): 5.43 | learning rate: 1.102E-04 | global batch size: 1024 | lm loss: 2.304182E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.424 | TFLOPs: 42.13 | +63: iteration 2780/ 5494 | consumed samples: 2846720 | consumed tokens: 5830082560 | elapsed time per iteration (s): 5.83 | learning rate: 1.097E-04 | global batch size: 1024 | lm loss: 2.314465E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.569 | TFLOPs: 39.25 | +63: iteration 2790/ 5494 | consumed samples: 2856960 | consumed tokens: 5851054080 | elapsed time per iteration (s): 5.76 | learning rate: 1.092E-04 | global batch size: 1024 | lm loss: 2.296543E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.789 | TFLOPs: 39.75 | +63: iteration 2800/ 5494 | consumed samples: 2867200 | consumed tokens: 5872025600 | elapsed time per iteration (s): 5.53 | learning rate: 1.087E-04 | global batch size: 1024 | lm loss: 2.300083E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.108 | TFLOPs: 41.38 | +63: iteration 2810/ 5494 | consumed samples: 2877440 | consumed tokens: 5892997120 | elapsed time per iteration (s): 5.54 | learning rate: 1.082E-04 | global batch size: 1024 | lm loss: 2.316658E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.683 | TFLOPs: 41.29 | +63: iteration 2820/ 5494 | consumed samples: 2887680 | consumed tokens: 5913968640 | elapsed time per iteration (s): 5.80 | learning rate: 1.076E-04 | global batch size: 1024 | lm loss: 2.311092E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.588 | TFLOPs: 39.48 | +63: iteration 2830/ 5494 | consumed samples: 2897920 | consumed tokens: 5934940160 | elapsed time per iteration (s): 5.65 | learning rate: 1.071E-04 | global batch size: 1024 | lm loss: 2.286285E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.264 | TFLOPs: 40.52 | +63: iteration 2840/ 5494 | consumed samples: 2908160 | consumed tokens: 5955911680 | elapsed time per iteration (s): 5.75 | learning rate: 1.066E-04 | global batch size: 1024 | lm loss: 2.302586E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.957 | TFLOPs: 39.79 | +63: iteration 2850/ 5494 | consumed samples: 2918400 | consumed tokens: 5976883200 | elapsed time per iteration (s): 5.64 | learning rate: 1.061E-04 | global batch size: 1024 | lm loss: 2.302692E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.422 | TFLOPs: 40.56 | +63: iteration 2860/ 5494 | consumed samples: 2928640 | consumed tokens: 5997854720 | elapsed time per iteration (s): 6.00 | learning rate: 1.056E-04 | global batch size: 1024 | lm loss: 2.282582E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 170.535 | TFLOPs: 38.13 | +63: iteration 2870/ 5494 | consumed samples: 2938880 | consumed tokens: 6018826240 | elapsed time per iteration (s): 5.74 | learning rate: 1.050E-04 | global batch size: 1024 | lm loss: 2.297619E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.382 | TFLOPs: 39.88 | +63: iteration 2880/ 5494 | consumed samples: 2949120 | consumed tokens: 6039797760 | elapsed time per iteration (s): 5.52 | learning rate: 1.045E-04 | global batch size: 1024 | lm loss: 2.306599E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.380 | TFLOPs: 41.44 | +63: iteration 2890/ 5494 | consumed samples: 2959360 | consumed tokens: 6060769280 | elapsed time per iteration (s): 5.64 | learning rate: 1.040E-04 | global batch size: 1024 | lm loss: 2.295145E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.684 | TFLOPs: 40.62 | +63: iteration 2900/ 5494 | consumed samples: 2969600 | consumed tokens: 6081740800 | elapsed time per iteration (s): 5.74 | learning rate: 1.035E-04 | global batch size: 1024 | lm loss: 2.288017E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.527 | TFLOPs: 39.91 | +63: iteration 2910/ 5494 | consumed samples: 2979840 | consumed tokens: 6102712320 | elapsed time per iteration (s): 5.88 | learning rate: 1.030E-04 | global batch size: 1024 | lm loss: 2.303008E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.161 | TFLOPs: 38.94 | +63: iteration 2920/ 5494 | consumed samples: 2990080 | consumed tokens: 6123683840 | elapsed time per iteration (s): 5.76 | learning rate: 1.024E-04 | global batch size: 1024 | lm loss: 2.285459E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.641 | TFLOPs: 39.71 | +63: iteration 2930/ 5494 | consumed samples: 3000320 | consumed tokens: 6144655360 | elapsed time per iteration (s): 5.94 | learning rate: 1.019E-04 | global batch size: 1024 | lm loss: 2.285719E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.377 | TFLOPs: 38.54 | +63: iteration 2940/ 5494 | consumed samples: 3010560 | consumed tokens: 6165626880 | elapsed time per iteration (s): 5.41 | learning rate: 1.014E-04 | global batch size: 1024 | lm loss: 2.281183E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.106 | TFLOPs: 42.28 | +63: iteration 2950/ 5494 | consumed samples: 3020800 | consumed tokens: 6186598400 | elapsed time per iteration (s): 5.39 | learning rate: 1.009E-04 | global batch size: 1024 | lm loss: 2.283249E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.992 | TFLOPs: 42.48 | +63: iteration 2960/ 5494 | consumed samples: 3031040 | consumed tokens: 6207569920 | elapsed time per iteration (s): 5.70 | learning rate: 1.004E-04 | global batch size: 1024 | lm loss: 2.263688E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.781 | TFLOPs: 40.19 | +63: iteration 2970/ 5494 | consumed samples: 3041280 | consumed tokens: 6228541440 | elapsed time per iteration (s): 5.51 | learning rate: 9.986E-05 | global batch size: 1024 | lm loss: 2.273630E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.712 | TFLOPs: 41.52 | +63: iteration 2980/ 5494 | consumed samples: 3051520 | consumed tokens: 6249512960 | elapsed time per iteration (s): 5.66 | learning rate: 9.934E-05 | global batch size: 1024 | lm loss: 2.288673E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.067 | TFLOPs: 40.48 | +63: iteration 2990/ 5494 | consumed samples: 3061760 | consumed tokens: 6270484480 | elapsed time per iteration (s): 5.85 | learning rate: 9.883E-05 | global batch size: 1024 | lm loss: 2.283151E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.189 | TFLOPs: 39.17 | +63: iteration 3000/ 5494 | consumed samples: 3072000 | consumed tokens: 6291456000 | elapsed time per iteration (s): 5.85 | learning rate: 9.831E-05 | global batch size: 1024 | lm loss: 2.292481E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.996 | TFLOPs: 39.12 | +63: ------------------------------------------------------------------------------------------ +63: valid loss at iteration 3000 | lm loss value: 2.226605E+00 | lm loss PPL: 9.268344E+00 | +63: ------------------------------------------------------------------------------------------ + 0: saving checkpoint at iteration 3000 to checkpoints_8b7beta + 0: [2022-12-01 23:16:58,973] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3000 is begin to save! +32: [2022-12-01 23:16:59,108] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_24-model_00-model_states.pt... +32: [2022-12-01 23:16:59,107] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_24-model_01-model_states.pt... + 0: [2022-12-01 23:16:59,108] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_01-model_00-model_states.pt... + 0: [2022-12-01 23:16:59,108] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_01-model_01-model_states.pt... +32: [2022-12-01 23:16:59,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_24-model_01-model_states.pt. +32: [2022-12-01 23:16:59,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_24-model_00-model_states.pt. + 0: [2022-12-01 23:16:59,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_01-model_01-model_states.pt. + 0: [2022-12-01 23:16:59,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_01-model_00-model_states.pt. +32: [2022-12-01 23:16:59,643] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_25-model_01-model_states.pt... +32: [2022-12-01 23:16:59,643] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_25-model_00-model_states.pt... + 0: [2022-12-01 23:16:59,643] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_03-model_01-model_states.pt... + 0: [2022-12-01 23:16:59,643] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_03-model_00-model_states.pt... + 0: [2022-12-01 23:16:59,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_03-model_01-model_states.pt. + 0: [2022-12-01 23:16:59,865] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_04-model_01-model_states.pt... +32: [2022-12-01 23:16:59,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_25-model_01-model_states.pt. +32: [2022-12-01 23:16:59,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_26-model_01-model_states.pt... +32: [2022-12-01 23:16:59,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_25-model_00-model_states.pt. +32: [2022-12-01 23:16:59,892] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_26-model_00-model_states.pt... + 0: [2022-12-01 23:16:59,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_03-model_00-model_states.pt. + 0: [2022-12-01 23:16:59,896] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_04-model_00-model_states.pt... + 0: [2022-12-01 23:17:00,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_04-model_01-model_states.pt. + 0: [2022-12-01 23:17:00,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_05-model_01-model_states.pt... +32: [2022-12-01 23:17:00,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_26-model_01-model_states.pt. +32: [2022-12-01 23:17:00,134] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_27-model_01-model_states.pt... + 0: [2022-12-01 23:17:00,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_04-model_00-model_states.pt. + 0: [2022-12-01 23:17:00,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_05-model_00-model_states.pt... +32: [2022-12-01 23:17:00,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_26-model_00-model_states.pt. +32: [2022-12-01 23:17:00,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_27-model_00-model_states.pt... + 0: [2022-12-01 23:17:00,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_05-model_01-model_states.pt. + 0: [2022-12-01 23:17:00,312] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_06-model_01-model_states.pt... + 0: [2022-12-01 23:17:00,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_05-model_00-model_states.pt. + 0: [2022-12-01 23:17:00,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_06-model_00-model_states.pt... +32: [2022-12-01 23:17:00,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_27-model_01-model_states.pt. +32: [2022-12-01 23:17:00,388] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_28-model_01-model_states.pt... +32: [2022-12-01 23:17:00,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_27-model_00-model_states.pt. +32: [2022-12-01 23:17:00,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_28-model_00-model_states.pt... + 0: [2022-12-01 23:17:00,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_06-model_01-model_states.pt. + 0: [2022-12-01 23:17:00,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_07-model_01-model_states.pt... +32: [2022-12-01 23:17:00,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_28-model_01-model_states.pt. +32: [2022-12-01 23:17:00,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_29-model_01-model_states.pt... + 0: [2022-12-01 23:17:00,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_06-model_00-model_states.pt. + 0: [2022-12-01 23:17:00,636] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_07-model_00-model_states.pt... +32: [2022-12-01 23:17:00,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_28-model_00-model_states.pt. +32: [2022-12-01 23:17:00,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_29-model_00-model_states.pt... + 0: [2022-12-01 23:17:00,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_07-model_01-model_states.pt. + 0: [2022-12-01 23:17:00,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_08-model_01-model_states.pt... +32: [2022-12-01 23:17:00,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_29-model_01-model_states.pt. +32: [2022-12-01 23:17:00,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_30-model_01-model_states.pt... + 0: [2022-12-01 23:17:00,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_07-model_00-model_states.pt. + 0: [2022-12-01 23:17:00,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_08-model_00-model_states.pt... +32: [2022-12-01 23:17:00,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_29-model_00-model_states.pt. +32: [2022-12-01 23:17:00,881] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_30-model_00-model_states.pt... + 0: [2022-12-01 23:17:01,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_08-model_01-model_states.pt. + 0: [2022-12-01 23:17:01,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_09-model_01-model_states.pt... +32: [2022-12-01 23:17:01,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_30-model_01-model_states.pt. +32: [2022-12-01 23:17:01,077] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_31-model_01-model_states.pt... +32: [2022-12-01 23:17:01,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_30-model_00-model_states.pt. +32: [2022-12-01 23:17:01,101] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_31-model_00-model_states.pt... + 0: [2022-12-01 23:17:01,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_08-model_00-model_states.pt. + 0: [2022-12-01 23:17:01,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_09-model_00-model_states.pt... + 0: [2022-12-01 23:17:01,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_09-model_01-model_states.pt. + 0: [2022-12-01 23:17:01,286] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_10-model_01-model_states.pt... +32: [2022-12-01 23:17:01,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_31-model_01-model_states.pt. +32: [2022-12-01 23:17:01,309] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_32-model_01-model_states.pt... +32: [2022-12-01 23:17:01,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_31-model_00-model_states.pt. +32: [2022-12-01 23:17:01,339] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_32-model_00-model_states.pt... + 0: [2022-12-01 23:17:01,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_09-model_00-model_states.pt. + 0: [2022-12-01 23:17:01,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_10-model_00-model_states.pt... + 0: [2022-12-01 23:17:01,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_10-model_01-model_states.pt. + 0: [2022-12-01 23:17:01,519] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_11-model_01-model_states.pt... +32: [2022-12-01 23:17:01,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_32-model_01-model_states.pt. +32: [2022-12-01 23:17:01,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_33-model_01-model_states.pt... +32: [2022-12-01 23:17:01,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_32-model_00-model_states.pt. +32: [2022-12-01 23:17:01,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_33-model_00-model_states.pt... + 0: [2022-12-01 23:17:01,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_10-model_00-model_states.pt. + 0: [2022-12-01 23:17:01,624] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_11-model_00-model_states.pt... +32: [2022-12-01 23:17:01,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_33-model_01-model_states.pt. +32: [2022-12-01 23:17:01,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_34-model_01-model_states.pt... + 0: [2022-12-01 23:17:01,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_11-model_01-model_states.pt. + 0: [2022-12-01 23:17:01,784] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_12-model_01-model_states.pt... +32: [2022-12-01 23:17:01,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_33-model_00-model_states.pt. +32: [2022-12-01 23:17:01,792] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_34-model_00-model_states.pt... + 0: [2022-12-01 23:17:01,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_11-model_00-model_states.pt. + 0: [2022-12-01 23:17:01,860] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_12-model_00-model_states.pt... +32: [2022-12-01 23:17:02,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_34-model_01-model_states.pt. +32: [2022-12-01 23:17:02,006] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_35-model_01-model_states.pt... +32: [2022-12-01 23:17:02,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_34-model_00-model_states.pt. +32: [2022-12-01 23:17:02,027] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_35-model_00-model_states.pt... + 0: [2022-12-01 23:17:02,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_12-model_01-model_states.pt. + 0: [2022-12-01 23:17:02,036] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_13-model_01-model_states.pt... + 0: [2022-12-01 23:17:02,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_12-model_00-model_states.pt. + 0: [2022-12-01 23:17:02,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_13-model_00-model_states.pt... +32: [2022-12-01 23:17:02,260] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_35-model_00-model_states.pt. +32: [2022-12-01 23:17:02,260] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_35-model_01-model_states.pt. +32: [2022-12-01 23:17:02,260] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_36-model_01-model_states.pt... +32: [2022-12-01 23:17:02,260] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_36-model_00-model_states.pt... + 0: [2022-12-01 23:17:02,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_13-model_01-model_states.pt. + 0: [2022-12-01 23:17:02,265] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_14-model_01-model_states.pt... + 0: [2022-12-01 23:17:02,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_13-model_00-model_states.pt. + 0: [2022-12-01 23:17:02,371] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_14-model_00-model_states.pt... + 0: [2022-12-01 23:17:02,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_14-model_01-model_states.pt. + 0: [2022-12-01 23:17:02,498] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_15-model_01-model_states.pt... +32: [2022-12-01 23:17:02,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_36-model_00-model_states.pt. +32: [2022-12-01 23:17:02,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_36-model_01-model_states.pt. +32: [2022-12-01 23:17:02,533] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_37-model_00-model_states.pt... +32: [2022-12-01 23:17:02,533] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_37-model_01-model_states.pt... + 0: [2022-12-01 23:17:02,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_14-model_00-model_states.pt. + 0: [2022-12-01 23:17:02,601] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_15-model_00-model_states.pt... + 0: [2022-12-01 23:17:02,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_15-model_01-model_states.pt. + 0: [2022-12-01 23:17:02,724] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_16-model_01-model_states.pt... +32: [2022-12-01 23:17:02,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_37-model_00-model_states.pt. +32: [2022-12-01 23:17:02,775] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_38-model_00-model_states.pt... +32: [2022-12-01 23:17:02,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_37-model_01-model_states.pt. +32: [2022-12-01 23:17:02,782] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_38-model_01-model_states.pt... + 0: [2022-12-01 23:17:02,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_15-model_00-model_states.pt. + 0: [2022-12-01 23:17:02,869] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_16-model_00-model_states.pt... + 0: [2022-12-01 23:17:02,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_16-model_01-model_states.pt. + 0: [2022-12-01 23:17:02,967] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_17-model_01-model_states.pt... +32: [2022-12-01 23:17:03,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_38-model_01-model_states.pt. +32: [2022-12-01 23:17:03,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_39-model_01-model_states.pt... +32: [2022-12-01 23:17:03,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_38-model_00-model_states.pt. +32: [2022-12-01 23:17:03,017] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_39-model_00-model_states.pt... + 0: [2022-12-01 23:17:03,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_16-model_00-model_states.pt. + 0: [2022-12-01 23:17:03,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_17-model_00-model_states.pt... + 0: [2022-12-01 23:17:03,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_17-model_01-model_states.pt. + 0: [2022-12-01 23:17:03,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_18-model_01-model_states.pt... +32: [2022-12-01 23:17:03,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_39-model_01-model_states.pt. +32: [2022-12-01 23:17:03,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_40-model_01-model_states.pt... +32: [2022-12-01 23:17:03,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_39-model_00-model_states.pt. +32: [2022-12-01 23:17:03,262] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_40-model_00-model_states.pt... + 0: [2022-12-01 23:17:03,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_17-model_00-model_states.pt. + 0: [2022-12-01 23:17:03,370] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_18-model_00-model_states.pt... + 0: [2022-12-01 23:17:03,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_18-model_01-model_states.pt. + 0: [2022-12-01 23:17:03,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_19-model_01-model_states.pt... +32: [2022-12-01 23:17:03,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_40-model_00-model_states.pt. +32: [2022-12-01 23:17:03,496] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_41-model_00-model_states.pt... +32: [2022-12-01 23:17:03,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_40-model_01-model_states.pt. +32: [2022-12-01 23:17:03,506] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_41-model_01-model_states.pt... + 0: [2022-12-01 23:17:03,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_18-model_00-model_states.pt. + 0: [2022-12-01 23:17:03,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_19-model_00-model_states.pt... + 0: [2022-12-01 23:17:03,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_19-model_01-model_states.pt. + 0: [2022-12-01 23:17:03,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_20-model_01-model_states.pt... +32: [2022-12-01 23:17:03,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_41-model_00-model_states.pt. +32: [2022-12-01 23:17:03,754] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_42-model_00-model_states.pt... +32: [2022-12-01 23:17:03,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_41-model_01-model_states.pt. +32: [2022-12-01 23:17:03,762] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_42-model_01-model_states.pt... + 0: [2022-12-01 23:17:03,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_19-model_00-model_states.pt. + 0: [2022-12-01 23:17:03,863] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_20-model_00-model_states.pt... + 0: [2022-12-01 23:17:03,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_20-model_01-model_states.pt. + 0: [2022-12-01 23:17:03,872] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_21-model_01-model_states.pt... +32: [2022-12-01 23:17:03,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_42-model_00-model_states.pt. +32: [2022-12-01 23:17:03,996] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_43-model_00-model_states.pt... +32: [2022-12-01 23:17:04,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_42-model_01-model_states.pt. +32: [2022-12-01 23:17:04,000] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_43-model_01-model_states.pt... + 0: [2022-12-01 23:17:04,098] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_21-model_01-model_states.pt. + 0: [2022-12-01 23:17:04,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_22-model_01-model_states.pt... + 0: [2022-12-01 23:17:04,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_20-model_00-model_states.pt. + 0: [2022-12-01 23:17:04,105] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_21-model_00-model_states.pt... +32: [2022-12-01 23:17:04,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_43-model_00-model_states.pt. +32: [2022-12-01 23:17:04,246] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_44-model_00-model_states.pt... +32: [2022-12-01 23:17:04,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_43-model_01-model_states.pt. +32: [2022-12-01 23:17:04,254] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_44-model_01-model_states.pt... + 0: [2022-12-01 23:17:04,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_21-model_00-model_states.pt. + 0: [2022-12-01 23:17:04,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_22-model_00-model_states.pt... + 0: [2022-12-01 23:17:04,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_22-model_01-model_states.pt. + 0: [2022-12-01 23:17:04,357] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_23-model_01-model_states.pt... +32: [2022-12-01 23:17:04,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_44-model_00-model_states.pt. +32: [2022-12-01 23:17:04,476] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_46-model_00-model_states.pt... +32: [2022-12-01 23:17:04,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_44-model_01-model_states.pt. +32: [2022-12-01 23:17:04,492] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_46-model_01-model_states.pt... +32: [2022-12-01 23:17:04,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_46-model_00-model_states.pt. +32: [2022-12-01 23:17:04,506] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/mp_rank_02_model_states.pt... +32: [2022-12-01 23:17:04,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_46-model_01-model_states.pt. +32: [2022-12-01 23:17:04,507] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/mp_rank_03_model_states.pt... +32: [2022-12-01 23:17:04,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/mp_rank_02_model_states.pt. +32: [2022-12-01 23:17:04,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/mp_rank_03_model_states.pt. + 0: [2022-12-01 23:17:04,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_23-model_01-model_states.pt. + 0: [2022-12-01 23:17:04,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_22-model_00-model_states.pt. + 0: [2022-12-01 23:17:04,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/layer_23-model_00-model_states.pt... + 0: [2022-12-01 23:17:04,603] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: checkpoints_8b7beta/global_step3000/mp_rank_01_model_states.pt + 0: [2022-12-01 23:17:04,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/mp_rank_01_model_states.pt... + 0: [2022-12-01 23:17:04,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/mp_rank_01_model_states.pt. + 0: [2022-12-01 23:17:04,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/layer_23-model_00-model_states.pt. + 0: [2022-12-01 23:17:04,841] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_8b7beta/global_step3000/mp_rank_00_model_states.pt + 0: [2022-12-01 23:17:04,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/mp_rank_00_model_states.pt... + 0: [2022-12-01 23:17:04,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/mp_rank_00_model_states.pt. +32: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt... +32: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt... + 0: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... + 0: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... +54: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt... +54: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt... +54: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt... +54: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt... +56: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt... +56: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt... +56: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt... +62: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt... +62: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt... +60: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt... +60: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt... +60: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt... +60: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt... +60: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt... +51: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt... +51: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt... +51: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt... +51: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt... +63: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt... +63: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt... +63: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt... +63: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt... +39: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt... +39: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt... +39: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt... +39: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt... + 7: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... + 7: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... +57: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt... +57: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt... +27: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... +42: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt... +42: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt... +42: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt... +42: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt... +58: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt... +58: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt... +58: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt... +58: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt... +34: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt... +34: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt... +34: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt... +28: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... +32: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt... +50: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt... +50: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt... +46: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt... +46: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt... +46: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt... +46: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt... +46: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt... +49: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt... +49: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt... +49: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt... +10: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... +37: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt... +37: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt... +37: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt... +37: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt... +37: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt... + 8: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... + 8: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... +52: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt... +52: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt... +52: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt... +61: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt... +61: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt... +45: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt... +45: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt... +45: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt... +45: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt... +47: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt... +47: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt... +47: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt... +47: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt... +47: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt... +59: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt... +59: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt... +59: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt... +59: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt... +35: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt... +35: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt... +35: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt... +19: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... +48: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt... +48: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt... +48: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt... +53: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt... +53: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt... +53: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt... +53: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt... +40: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt... +40: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt... +40: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt... +40: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt... +16: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... + 0: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... + 0: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +54: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt... +54: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt... +54: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt... +54: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt... +56: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt... +56: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt... +56: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt... +62: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt... +62: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt... +62: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt... + 4: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... +60: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt... +60: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt... +51: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt... +51: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt... +51: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt... +51: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt... +63: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt... +63: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt... +63: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt... +17: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... +17: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... +17: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... + 1: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... + 1: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... +39: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt... + 7: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... + 7: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... + 7: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... +11: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +11: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... +57: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt... +57: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt... +27: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... +43: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt... +43: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt... +25: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... +13: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... +14: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... +38: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt... +38: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt... +38: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt... +38: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt... +22: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... +36: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt... +36: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt... +36: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt... +42: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt... +42: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt... +58: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt... +58: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt... +58: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt... +44: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt... +44: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt... +44: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt... +44: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt... +44: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt... +34: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt... +34: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt... +34: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt... +34: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt... +34: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt... +28: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... +32: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt... +32: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt... +32: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt... +50: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt... +46: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt... +46: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt... +26: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... +26: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... +26: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... +49: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt... +49: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt... + 5: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... +20: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... +20: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... +41: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt... +41: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt... +41: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt... +41: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt... +55: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt... +55: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt... +55: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt... +55: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt... +10: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +10: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +10: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +10: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +37: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt... +37: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt... +37: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt... + 8: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... +18: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... +18: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... +21: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... +21: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... +33: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt... +33: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt... +33: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt... +33: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt... +33: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt... +52: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt... + 6: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... + 6: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... + 6: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +31: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... +61: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt... +61: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt... +61: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt... +15: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... +15: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... +15: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... +45: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt... +47: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt... +47: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt... +47: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt... + 9: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... + 9: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... +59: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt... +12: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... +12: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... +12: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... + 2: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... +29: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... +29: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... +35: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt... +19: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... +48: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt... +48: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt... +48: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt... +53: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt... +53: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt... +53: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt... +53: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt... +40: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt... +30: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... +23: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... +16: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... + 0: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +56: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt... +56: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt... +62: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt... + 4: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... +60: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt... +63: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt... +17: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... +17: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... + 1: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... +39: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt... + 7: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... + 7: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +11: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... +57: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt... +27: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... +27: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... +27: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... +43: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt... +43: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt... +25: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... +13: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... +13: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... +14: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... +14: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... +38: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt... +38: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt... +38: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt... +22: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... +22: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... +36: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt... +36: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt... +36: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt... +36: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt... +24: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... +24: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... +24: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... +24: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... +42: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt... +58: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt... +44: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt... +28: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... +32: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt... +32: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt... +50: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt... +50: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt... +50: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt... +46: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt... +26: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... +49: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt... +49: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt... + 5: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... + 5: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... +20: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... +41: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt... +41: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt... +41: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt... +55: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt... +55: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt... +10: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... + 8: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... +18: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... +21: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... +21: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... +33: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt... +52: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt... +52: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt... + 6: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... +31: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... +61: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt... +61: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt... +61: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt... +15: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +45: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt... + 3: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... + 3: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... + 3: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... + 3: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... + 3: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... + 9: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... + 9: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +59: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt... +59: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt... +12: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +12: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... + 2: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... + 2: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... + 2: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +29: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... +35: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt... +19: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... +48: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt... +40: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt... +40: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt... +30: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... +23: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... +23: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... +16: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... +16: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... +16: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... +16: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... + 0: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... + 0: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... +62: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt... +62: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt... + 4: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... + 4: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... + 4: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +17: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... + 1: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... +39: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt... + 7: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... +11: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... +57: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt... +27: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... +43: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt... +43: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt... +25: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... +25: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... +25: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... +13: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +14: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... +14: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +38: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt... +22: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... +22: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... +36: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt... +24: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... +24: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... +42: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt... +44: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt... +28: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... +50: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt... +26: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... +49: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt... + 5: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... +20: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... +41: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt... +55: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt... +10: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... + 8: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +18: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... +18: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... +18: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... +18: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... +21: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... +33: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt... +33: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt... +52: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt... + 6: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... + 6: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... +31: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... +15: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... +15: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +45: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt... + 3: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... + 9: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... + 9: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +59: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt... +12: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... + 2: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... + 2: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... + 2: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +29: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... +35: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt... +48: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt... +30: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... +23: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... +23: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... +16: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... + 0: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... + 4: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +17: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... + 1: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +39: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt... +11: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +57: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt... +27: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... +43: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt... +43: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt... +25: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... +25: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... +13: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +14: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +22: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... +24: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... +44: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt... +28: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... +50: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt... +26: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... +26: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... + 5: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +20: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... +20: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... +55: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt... +10: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... + 8: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +18: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... +21: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... +52: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt... + 6: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +31: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... +15: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +45: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt... + 3: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... + 9: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... + 9: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +12: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... + 2: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +29: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... +29: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... +29: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... +35: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt... +19: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... +19: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... +40: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt... +30: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... +23: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... +16: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... + 4: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +17: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... + 1: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... + 1: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +11: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +11: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... +11: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +57: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt... +27: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... +25: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... +13: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +14: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +14: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +22: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... +24: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... +28: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... +28: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... +26: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... + 5: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... + 5: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... + 5: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +20: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... +20: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... + 8: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +21: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... +21: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... + 6: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +31: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... +15: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... + 3: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... +12: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... +29: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... +35: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt... +19: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... +19: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... +30: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... +23: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... + 4: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... + 1: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +13: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... +22: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... +28: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... + 8: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +31: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... +19: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... +23: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... +13: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +31: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... +30: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... +31: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... +30: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... +30: [2022-12-01 23:17:05,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... + 0: [2022-12-01 23:17:05,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. + 0: [2022-12-01 23:17:05,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt + 0: [2022-12-01 23:17:05,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +32: [2022-12-01 23:17:05,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt. +32: [2022-12-01 23:17:05,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt +32: [2022-12-01 23:17:05,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +32: [2022-12-01 23:17:05,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt. +32: [2022-12-01 23:17:05,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt +32: [2022-12-01 23:17:05,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 0: [2022-12-01 23:17:05,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. + 0: [2022-12-01 23:17:05,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. + 0: [2022-12-01 23:17:05,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt + 0: [2022-12-01 23:17:05,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 0: [2022-12-01 23:17:05,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. + 0: [2022-12-01 23:17:05,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt + 0: [2022-12-01 23:17:05,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 0: [2022-12-01 23:17:05,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. + 0: [2022-12-01 23:17:05,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt + 0: [2022-12-01 23:17:05,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 0: [2022-12-01 23:17:05,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. + 0: [2022-12-01 23:17:05,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt + 0: [2022-12-01 23:17:05,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 0: [2022-12-01 23:17:05,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. + 0: [2022-12-01 23:17:05,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt + 0: [2022-12-01 23:17:05,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 0: [2022-12-01 23:17:05,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. + 0: [2022-12-01 23:17:05,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt + 0: [2022-12-01 23:17:05,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +32: [2022-12-01 23:17:05,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt. +32: [2022-12-01 23:17:05,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt +32: [2022-12-01 23:17:05,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +32: [2022-12-01 23:17:05,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt. +32: [2022-12-01 23:17:05,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt +32: [2022-12-01 23:17:05,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +32: [2022-12-01 23:17:05,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt. +32: [2022-12-01 23:17:05,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt. +32: [2022-12-01 23:17:05,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt +32: [2022-12-01 23:17:05,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt +32: [2022-12-01 23:17:05,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +32: [2022-12-01 23:17:05,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +32: [2022-12-01 23:17:05,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt. +32: [2022-12-01 23:17:05,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt +32: [2022-12-01 23:17:05,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +32: [2022-12-01 23:17:05,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt. +32: [2022-12-01 23:17:05,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt +32: [2022-12-01 23:17:05,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 9: [2022-12-01 23:17:05,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. + 9: [2022-12-01 23:17:05,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. + 9: [2022-12-01 23:17:05,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt + 9: [2022-12-01 23:17:05,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt + 9: [2022-12-01 23:17:05,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 9: [2022-12-01 23:17:05,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 9: [2022-12-01 23:17:05,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. + 9: [2022-12-01 23:17:05,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt + 9: [2022-12-01 23:17:05,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +35: [2022-12-01 23:17:05,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt. +35: [2022-12-01 23:17:05,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt +35: [2022-12-01 23:17:05,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +35: [2022-12-01 23:17:05,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt. +35: [2022-12-01 23:17:05,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt +35: [2022-12-01 23:17:05,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +24: [2022-12-01 23:17:05,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. +24: [2022-12-01 23:17:05,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. +24: [2022-12-01 23:17:05,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt +24: [2022-12-01 23:17:05,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt +24: [2022-12-01 23:17:05,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +24: [2022-12-01 23:17:05,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +35: [2022-12-01 23:17:05,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt. +35: [2022-12-01 23:17:05,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt +35: [2022-12-01 23:17:05,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +35: [2022-12-01 23:17:05,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt. +35: [2022-12-01 23:17:05,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt +35: [2022-12-01 23:17:05,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 9: [2022-12-01 23:17:05,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. + 9: [2022-12-01 23:17:05,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt + 9: [2022-12-01 23:17:05,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 9: [2022-12-01 23:17:05,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. + 9: [2022-12-01 23:17:05,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt + 9: [2022-12-01 23:17:05,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +12: [2022-12-01 23:17:05,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. +12: [2022-12-01 23:17:05,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +12: [2022-12-01 23:17:05,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +12: [2022-12-01 23:17:05,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt +12: [2022-12-01 23:17:05,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +12: [2022-12-01 23:17:05,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +12: [2022-12-01 23:17:05,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +12: [2022-12-01 23:17:05,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +12: [2022-12-01 23:17:05,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +12: [2022-12-01 23:17:05,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. +12: [2022-12-01 23:17:05,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt +12: [2022-12-01 23:17:05,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +12: [2022-12-01 23:17:05,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +12: [2022-12-01 23:17:05,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +12: [2022-12-01 23:17:05,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +12: [2022-12-01 23:17:05,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. +12: [2022-12-01 23:17:05,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt +12: [2022-12-01 23:17:05,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +12: [2022-12-01 23:17:05,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. +12: [2022-12-01 23:17:05,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt +12: [2022-12-01 23:17:05,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +56: [2022-12-01 23:17:05,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt. +56: [2022-12-01 23:17:05,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt. +56: [2022-12-01 23:17:05,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt. +56: [2022-12-01 23:17:05,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt +56: [2022-12-01 23:17:05,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt +56: [2022-12-01 23:17:05,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt +56: [2022-12-01 23:17:05,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +56: [2022-12-01 23:17:05,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +56: [2022-12-01 23:17:05,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +56: [2022-12-01 23:17:05,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt. +56: [2022-12-01 23:17:05,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt. +35: [2022-12-01 23:17:05,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt. +35: [2022-12-01 23:17:05,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt +35: [2022-12-01 23:17:05,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +56: [2022-12-01 23:17:05,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt +56: [2022-12-01 23:17:05,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt +56: [2022-12-01 23:17:05,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +56: [2022-12-01 23:17:05,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +22: [2022-12-01 23:17:05,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. +34: [2022-12-01 23:17:05,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt. + 6: [2022-12-01 23:17:05,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +22: [2022-12-01 23:17:05,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. +34: [2022-12-01 23:17:05,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt + 6: [2022-12-01 23:17:05,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +22: [2022-12-01 23:17:05,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt +34: [2022-12-01 23:17:05,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 6: [2022-12-01 23:17:05,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +22: [2022-12-01 23:17:05,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +22: [2022-12-01 23:17:05,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt +34: [2022-12-01 23:17:05,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt. + 6: [2022-12-01 23:17:05,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +22: [2022-12-01 23:17:05,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +34: [2022-12-01 23:17:05,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt + 6: [2022-12-01 23:17:05,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +22: [2022-12-01 23:17:05,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. +34: [2022-12-01 23:17:05,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 6: [2022-12-01 23:17:05,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +22: [2022-12-01 23:17:05,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt +22: [2022-12-01 23:17:05,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +22: [2022-12-01 23:17:05,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. +22: [2022-12-01 23:17:05,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt +22: [2022-12-01 23:17:05,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +22: [2022-12-01 23:17:05,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. +22: [2022-12-01 23:17:05,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt +22: [2022-12-01 23:17:05,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +50: [2022-12-01 23:17:05,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt. +50: [2022-12-01 23:17:05,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt +50: [2022-12-01 23:17:05,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +51: [2022-12-01 23:17:05,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt. +51: [2022-12-01 23:17:05,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt +51: [2022-12-01 23:17:05,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +51: [2022-12-01 23:17:05,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt. +51: [2022-12-01 23:17:05,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt +51: [2022-12-01 23:17:05,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +30: [2022-12-01 23:17:05,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. +30: [2022-12-01 23:17:05,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. +30: [2022-12-01 23:17:05,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. +30: [2022-12-01 23:17:05,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. +30: [2022-12-01 23:17:05,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. +30: [2022-12-01 23:17:05,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt +30: [2022-12-01 23:17:05,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt +30: [2022-12-01 23:17:05,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt +30: [2022-12-01 23:17:05,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +30: [2022-12-01 23:17:05,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +30: [2022-12-01 23:17:05,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt +30: [2022-12-01 23:17:05,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt +30: [2022-12-01 23:17:05,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +30: [2022-12-01 23:17:05,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +30: [2022-12-01 23:17:05,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +51: [2022-12-01 23:17:05,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt. +51: [2022-12-01 23:17:05,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt. +51: [2022-12-01 23:17:05,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt +51: [2022-12-01 23:17:05,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt +51: [2022-12-01 23:17:05,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +51: [2022-12-01 23:17:05,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +47: [2022-12-01 23:17:05,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt. +47: [2022-12-01 23:17:05,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt. +47: [2022-12-01 23:17:05,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt +47: [2022-12-01 23:17:05,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt +47: [2022-12-01 23:17:05,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +47: [2022-12-01 23:17:05,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +46: [2022-12-01 23:17:05,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt. +46: [2022-12-01 23:17:05,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt +46: [2022-12-01 23:17:05,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +46: [2022-12-01 23:17:05,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt. +46: [2022-12-01 23:17:05,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt +46: [2022-12-01 23:17:05,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +46: [2022-12-01 23:17:05,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt. +46: [2022-12-01 23:17:05,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt. +46: [2022-12-01 23:17:05,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt. +46: [2022-12-01 23:17:05,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt +46: [2022-12-01 23:17:05,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt +46: [2022-12-01 23:17:05,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +46: [2022-12-01 23:17:05,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt +46: [2022-12-01 23:17:05,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +46: [2022-12-01 23:17:05,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 6: [2022-12-01 23:17:05,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. + 9: [2022-12-01 23:17:05,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. + 9: [2022-12-01 23:17:05,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt + 9: [2022-12-01 23:17:05,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +34: [2022-12-01 23:17:05,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt. +34: [2022-12-01 23:17:05,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt. + 6: [2022-12-01 23:17:05,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt + 6: [2022-12-01 23:17:05,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +34: [2022-12-01 23:17:05,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt. +34: [2022-12-01 23:17:05,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt +34: [2022-12-01 23:17:05,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt +34: [2022-12-01 23:17:05,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 6: [2022-12-01 23:17:05,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. +34: [2022-12-01 23:17:05,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 6: [2022-12-01 23:17:05,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt +34: [2022-12-01 23:17:05,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt + 6: [2022-12-01 23:17:05,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +34: [2022-12-01 23:17:05,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +50: [2022-12-01 23:17:05,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt. +50: [2022-12-01 23:17:05,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt +50: [2022-12-01 23:17:05,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +50: [2022-12-01 23:17:05,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt. +50: [2022-12-01 23:17:05,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt +50: [2022-12-01 23:17:05,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +50: [2022-12-01 23:17:05,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt. +50: [2022-12-01 23:17:05,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt +50: [2022-12-01 23:17:05,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +50: [2022-12-01 23:17:05,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt. +50: [2022-12-01 23:17:05,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt +50: [2022-12-01 23:17:05,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +57: [2022-12-01 23:17:05,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt. +31: [2022-12-01 23:17:05,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. +31: [2022-12-01 23:17:05,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. +31: [2022-12-01 23:17:05,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. +31: [2022-12-01 23:17:05,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt +31: [2022-12-01 23:17:05,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt +31: [2022-12-01 23:17:05,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt +31: [2022-12-01 23:17:05,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +31: [2022-12-01 23:17:05,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +31: [2022-12-01 23:17:05,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +50: [2022-12-01 23:17:05,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt. +50: [2022-12-01 23:17:05,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt. +50: [2022-12-01 23:17:05,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt. +50: [2022-12-01 23:17:05,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt +50: [2022-12-01 23:17:05,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt +57: [2022-12-01 23:17:05,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt +50: [2022-12-01 23:17:05,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt +57: [2022-12-01 23:17:05,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +50: [2022-12-01 23:17:05,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +50: [2022-12-01 23:17:05,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +57: [2022-12-01 23:17:05,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt. +50: [2022-12-01 23:17:05,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +57: [2022-12-01 23:17:05,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt +57: [2022-12-01 23:17:05,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt. +57: [2022-12-01 23:17:05,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +57: [2022-12-01 23:17:05,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt +57: [2022-12-01 23:17:05,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +31: [2022-12-01 23:17:05,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. +31: [2022-12-01 23:17:05,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. +31: [2022-12-01 23:17:05,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt +31: [2022-12-01 23:17:05,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +31: [2022-12-01 23:17:05,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt +31: [2022-12-01 23:17:05,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +44: [2022-12-01 23:17:05,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt. +44: [2022-12-01 23:17:05,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt. +37: [2022-12-01 23:17:05,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt. +37: [2022-12-01 23:17:05,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt. +37: [2022-12-01 23:17:05,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt. +37: [2022-12-01 23:17:05,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt. +37: [2022-12-01 23:17:05,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt. +37: [2022-12-01 23:17:05,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt +37: [2022-12-01 23:17:05,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt +37: [2022-12-01 23:17:05,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt +37: [2022-12-01 23:17:05,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt +37: [2022-12-01 23:17:05,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt +37: [2022-12-01 23:17:05,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +37: [2022-12-01 23:17:05,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +37: [2022-12-01 23:17:05,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +37: [2022-12-01 23:17:05,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +37: [2022-12-01 23:17:05,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +44: [2022-12-01 23:17:05,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt. +44: [2022-12-01 23:17:05,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt +44: [2022-12-01 23:17:05,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +44: [2022-12-01 23:17:05,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt +44: [2022-12-01 23:17:05,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt +44: [2022-12-01 23:17:05,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +44: [2022-12-01 23:17:05,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +47: [2022-12-01 23:17:05,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt. +47: [2022-12-01 23:17:05,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt +47: [2022-12-01 23:17:05,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +24: [2022-12-01 23:17:05,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. +24: [2022-12-01 23:17:05,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt +24: [2022-12-01 23:17:05,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +28: [2022-12-01 23:17:05,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. +28: [2022-12-01 23:17:05,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt +28: [2022-12-01 23:17:05,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +28: [2022-12-01 23:17:05,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. +28: [2022-12-01 23:17:05,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt +28: [2022-12-01 23:17:05,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +28: [2022-12-01 23:17:05,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. +28: [2022-12-01 23:17:05,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt +28: [2022-12-01 23:17:05,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +57: [2022-12-01 23:17:05,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt. +42: [2022-12-01 23:17:05,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt. +42: [2022-12-01 23:17:05,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt. +42: [2022-12-01 23:17:05,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt. +11: [2022-12-01 23:17:05,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. +11: [2022-12-01 23:17:05,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +42: [2022-12-01 23:17:05,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt +42: [2022-12-01 23:17:05,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt +41: [2022-12-01 23:17:05,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt. +41: [2022-12-01 23:17:05,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt. +41: [2022-12-01 23:17:05,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt. + 6: [2022-12-01 23:17:05,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +42: [2022-12-01 23:17:05,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt +42: [2022-12-01 23:17:05,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +42: [2022-12-01 23:17:05,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +42: [2022-12-01 23:17:05,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +41: [2022-12-01 23:17:05,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt. +41: [2022-12-01 23:17:05,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt +41: [2022-12-01 23:17:05,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt +41: [2022-12-01 23:17:05,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt +41: [2022-12-01 23:17:05,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +41: [2022-12-01 23:17:05,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +41: [2022-12-01 23:17:05,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +41: [2022-12-01 23:17:05,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt +41: [2022-12-01 23:17:05,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +57: [2022-12-01 23:17:05,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt +57: [2022-12-01 23:17:05,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +57: [2022-12-01 23:17:05,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt. +57: [2022-12-01 23:17:05,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt +57: [2022-12-01 23:17:05,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +57: [2022-12-01 23:17:05,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt. +57: [2022-12-01 23:17:05,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt +57: [2022-12-01 23:17:05,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 2: [2022-12-01 23:17:05,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. + 2: [2022-12-01 23:17:05,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. + 2: [2022-12-01 23:17:05,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. +19: [2022-12-01 23:17:05,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. +19: [2022-12-01 23:17:05,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. +19: [2022-12-01 23:17:05,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt +19: [2022-12-01 23:17:05,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt +19: [2022-12-01 23:17:05,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +19: [2022-12-01 23:17:05,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +19: [2022-12-01 23:17:05,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. +19: [2022-12-01 23:17:05,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. +19: [2022-12-01 23:17:05,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. +19: [2022-12-01 23:17:05,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt +19: [2022-12-01 23:17:05,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt +19: [2022-12-01 23:17:05,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt +19: [2022-12-01 23:17:05,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +19: [2022-12-01 23:17:05,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +19: [2022-12-01 23:17:05,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +43: [2022-12-01 23:17:05,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt. +43: [2022-12-01 23:17:05,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt. +43: [2022-12-01 23:17:05,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt. +43: [2022-12-01 23:17:05,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt +43: [2022-12-01 23:17:05,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +43: [2022-12-01 23:17:05,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt +43: [2022-12-01 23:17:05,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt +43: [2022-12-01 23:17:05,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +43: [2022-12-01 23:17:05,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +43: [2022-12-01 23:17:05,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt. +43: [2022-12-01 23:17:05,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt. +43: [2022-12-01 23:17:05,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt +43: [2022-12-01 23:17:05,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +43: [2022-12-01 23:17:05,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt +43: [2022-12-01 23:17:05,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +30: [2022-12-01 23:17:05,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. + 9: [2022-12-01 23:17:05,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +58: [2022-12-01 23:17:05,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt. +30: [2022-12-01 23:17:05,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt +30: [2022-12-01 23:17:05,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +11: [2022-12-01 23:17:05,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt +11: [2022-12-01 23:17:05,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt + 6: [2022-12-01 23:17:05,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +48: [2022-12-01 23:17:05,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt. +48: [2022-12-01 23:17:05,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt. +48: [2022-12-01 23:17:05,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt. +48: [2022-12-01 23:17:05,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt. +48: [2022-12-01 23:17:05,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt. +11: [2022-12-01 23:17:05,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +11: [2022-12-01 23:17:05,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 6: [2022-12-01 23:17:05,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +11: [2022-12-01 23:17:05,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +48: [2022-12-01 23:17:05,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt +48: [2022-12-01 23:17:05,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt +48: [2022-12-01 23:17:05,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt +11: [2022-12-01 23:17:05,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +48: [2022-12-01 23:17:05,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt +48: [2022-12-01 23:17:05,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt +11: [2022-12-01 23:17:05,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 9: [2022-12-01 23:17:05,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +48: [2022-12-01 23:17:05,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 9: [2022-12-01 23:17:05,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +48: [2022-12-01 23:17:05,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +48: [2022-12-01 23:17:05,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +48: [2022-12-01 23:17:05,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +48: [2022-12-01 23:17:05,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 2: [2022-12-01 23:17:05,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt + 2: [2022-12-01 23:17:05,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt + 1: [2022-12-01 23:17:05,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. + 1: [2022-12-01 23:17:05,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. + 1: [2022-12-01 23:17:05,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. +58: [2022-12-01 23:17:05,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt + 2: [2022-12-01 23:17:05,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt +58: [2022-12-01 23:17:05,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 2: [2022-12-01 23:17:05,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 2: [2022-12-01 23:17:05,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +58: [2022-12-01 23:17:05,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt. + 2: [2022-12-01 23:17:05,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +58: [2022-12-01 23:17:05,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt. + 2: [2022-12-01 23:17:05,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. +58: [2022-12-01 23:17:05,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt. +58: [2022-12-01 23:17:05,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt + 2: [2022-12-01 23:17:05,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt +58: [2022-12-01 23:17:05,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt +58: [2022-12-01 23:17:05,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 2: [2022-12-01 23:17:05,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +58: [2022-12-01 23:17:05,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt +58: [2022-12-01 23:17:05,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 2: [2022-12-01 23:17:05,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +58: [2022-12-01 23:17:05,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 2: [2022-12-01 23:17:05,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt + 2: [2022-12-01 23:17:05,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 2: [2022-12-01 23:17:05,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. + 2: [2022-12-01 23:17:05,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt + 2: [2022-12-01 23:17:05,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +51: [2022-12-01 23:17:05,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt. +58: [2022-12-01 23:17:05,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt. +58: [2022-12-01 23:17:05,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt +58: [2022-12-01 23:17:05,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +51: [2022-12-01 23:17:05,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt +51: [2022-12-01 23:17:05,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +47: [2022-12-01 23:17:05,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt. +47: [2022-12-01 23:17:05,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt +47: [2022-12-01 23:17:05,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +47: [2022-12-01 23:17:05,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt. +47: [2022-12-01 23:17:05,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt +47: [2022-12-01 23:17:05,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +58: [2022-12-01 23:17:05,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt. +58: [2022-12-01 23:17:05,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt. +12: [2022-12-01 23:17:05,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +58: [2022-12-01 23:17:05,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt. +58: [2022-12-01 23:17:05,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt +58: [2022-12-01 23:17:05,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt +58: [2022-12-01 23:17:05,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt +58: [2022-12-01 23:17:05,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +58: [2022-12-01 23:17:05,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +58: [2022-12-01 23:17:05,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +12: [2022-12-01 23:17:05,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +12: [2022-12-01 23:17:05,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +28: [2022-12-01 23:17:05,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. +57: [2022-12-01 23:17:05,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt. +57: [2022-12-01 23:17:05,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt +57: [2022-12-01 23:17:05,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +57: [2022-12-01 23:17:05,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt. +57: [2022-12-01 23:17:05,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt +57: [2022-12-01 23:17:05,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +28: [2022-12-01 23:17:05,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt +28: [2022-12-01 23:17:05,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +28: [2022-12-01 23:17:05,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. +28: [2022-12-01 23:17:05,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt +28: [2022-12-01 23:17:05,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +28: [2022-12-01 23:17:05,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. +28: [2022-12-01 23:17:05,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt +28: [2022-12-01 23:17:05,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +39: [2022-12-01 23:17:05,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt. +39: [2022-12-01 23:17:05,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt. +39: [2022-12-01 23:17:05,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt. +41: [2022-12-01 23:17:05,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt. +23: [2022-12-01 23:17:05,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. +23: [2022-12-01 23:17:05,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. +23: [2022-12-01 23:17:05,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. +23: [2022-12-01 23:17:05,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. +23: [2022-12-01 23:17:05,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. +39: [2022-12-01 23:17:05,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt +39: [2022-12-01 23:17:05,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt +39: [2022-12-01 23:17:05,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt +41: [2022-12-01 23:17:05,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt +41: [2022-12-01 23:17:05,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt. +23: [2022-12-01 23:17:05,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt +39: [2022-12-01 23:17:05,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +39: [2022-12-01 23:17:05,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +39: [2022-12-01 23:17:05,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +41: [2022-12-01 23:17:05,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +23: [2022-12-01 23:17:05,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt +23: [2022-12-01 23:17:05,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt +23: [2022-12-01 23:17:05,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt +23: [2022-12-01 23:17:05,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt +39: [2022-12-01 23:17:05,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt. +41: [2022-12-01 23:17:05,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt +23: [2022-12-01 23:17:05,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +23: [2022-12-01 23:17:05,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +23: [2022-12-01 23:17:05,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +23: [2022-12-01 23:17:05,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +23: [2022-12-01 23:17:05,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +39: [2022-12-01 23:17:05,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt +41: [2022-12-01 23:17:05,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +39: [2022-12-01 23:17:05,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +41: [2022-12-01 23:17:05,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt. +39: [2022-12-01 23:17:05,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt. +41: [2022-12-01 23:17:05,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt +39: [2022-12-01 23:17:05,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt +41: [2022-12-01 23:17:05,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +39: [2022-12-01 23:17:05,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 3: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. + 3: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. + 3: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. + 3: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. + 3: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. + 3: [2022-12-01 23:17:05,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt + 3: [2022-12-01 23:17:05,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt + 3: [2022-12-01 23:17:05,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt + 3: [2022-12-01 23:17:05,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt + 3: [2022-12-01 23:17:05,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt + 3: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 3: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 3: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 3: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 3: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +38: [2022-12-01 23:17:05,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt. +38: [2022-12-01 23:17:05,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt. +38: [2022-12-01 23:17:05,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt. +33: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt. +33: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt. +33: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt. +33: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt. +33: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt. +33: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt. +33: [2022-12-01 23:17:05,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt +33: [2022-12-01 23:17:05,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt +33: [2022-12-01 23:17:05,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt +33: [2022-12-01 23:17:05,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt +33: [2022-12-01 23:17:05,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt +33: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +33: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +33: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 1: [2022-12-01 23:17:05,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +33: [2022-12-01 23:17:05,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt + 1: [2022-12-01 23:17:05,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt +33: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 1: [2022-12-01 23:17:05,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +33: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 1: [2022-12-01 23:17:05,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +33: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 1: [2022-12-01 23:17:05,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 1: [2022-12-01 23:17:05,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +25: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. +25: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. +25: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. +25: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. +25: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. +25: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. +38: [2022-12-01 23:17:05,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt. +38: [2022-12-01 23:17:05,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt. +38: [2022-12-01 23:17:05,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt +38: [2022-12-01 23:17:05,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt. +38: [2022-12-01 23:17:05,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt +38: [2022-12-01 23:17:05,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt +38: [2022-12-01 23:17:05,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +38: [2022-12-01 23:17:05,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt +38: [2022-12-01 23:17:05,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt +38: [2022-12-01 23:17:05,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt +38: [2022-12-01 23:17:05,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +38: [2022-12-01 23:17:05,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +25: [2022-12-01 23:17:05,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt +25: [2022-12-01 23:17:05,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt +25: [2022-12-01 23:17:05,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt +38: [2022-12-01 23:17:05,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +38: [2022-12-01 23:17:05,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +25: [2022-12-01 23:17:05,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt +25: [2022-12-01 23:17:05,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt +38: [2022-12-01 23:17:05,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +25: [2022-12-01 23:17:05,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt +25: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +25: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +25: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +25: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +25: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +25: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +26: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. +26: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. +26: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. +26: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. +29: [2022-12-01 23:17:05,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. +24: [2022-12-01 23:17:05,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. +26: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. +26: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. +29: [2022-12-01 23:17:05,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt +26: [2022-12-01 23:17:05,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt +26: [2022-12-01 23:17:05,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt +26: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. +26: [2022-12-01 23:17:05,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt +29: [2022-12-01 23:17:05,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +26: [2022-12-01 23:17:05,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt +26: [2022-12-01 23:17:05,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt +26: [2022-12-01 23:17:05,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt +26: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +26: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +29: [2022-12-01 23:17:05,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. +26: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +29: [2022-12-01 23:17:05,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt +26: [2022-12-01 23:17:05,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt +29: [2022-12-01 23:17:05,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +42: [2022-12-01 23:17:05,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt. +26: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +29: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. +24: [2022-12-01 23:17:05,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt +26: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +26: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +29: [2022-12-01 23:17:05,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt +24: [2022-12-01 23:17:05,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +26: [2022-12-01 23:17:05,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +29: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. +26: [2022-12-01 23:17:05,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. +29: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +26: [2022-12-01 23:17:05,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt +29: [2022-12-01 23:17:05,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt +26: [2022-12-01 23:17:05,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +29: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +29: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. +29: [2022-12-01 23:17:05,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt +29: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +29: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. +29: [2022-12-01 23:17:05,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt +29: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +29: [2022-12-01 23:17:05,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. +29: [2022-12-01 23:17:05,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt +29: [2022-12-01 23:17:05,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +42: [2022-12-01 23:17:05,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt +42: [2022-12-01 23:17:05,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +44: [2022-12-01 23:17:05,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt. +29: [2022-12-01 23:17:05,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. +44: [2022-12-01 23:17:05,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt +44: [2022-12-01 23:17:05,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +29: [2022-12-01 23:17:05,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt +29: [2022-12-01 23:17:05,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +53: [2022-12-01 23:17:05,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt. +53: [2022-12-01 23:17:05,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt. +53: [2022-12-01 23:17:05,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt. +45: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt. +45: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt. +45: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt. +53: [2022-12-01 23:17:05,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt. +53: [2022-12-01 23:17:05,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt. +53: [2022-12-01 23:17:05,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt. +53: [2022-12-01 23:17:05,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt +53: [2022-12-01 23:17:05,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt +53: [2022-12-01 23:17:05,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt +53: [2022-12-01 23:17:05,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt +54: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt. +54: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt. +54: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt. +54: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt. +53: [2022-12-01 23:17:05,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +53: [2022-12-01 23:17:05,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt +53: [2022-12-01 23:17:05,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt +53: [2022-12-01 23:17:05,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +53: [2022-12-01 23:17:05,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +53: [2022-12-01 23:17:05,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +53: [2022-12-01 23:17:05,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +53: [2022-12-01 23:17:05,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +45: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt. +45: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt. +45: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt. +45: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt. +45: [2022-12-01 23:17:05,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt +45: [2022-12-01 23:17:05,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt +45: [2022-12-01 23:17:05,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt +45: [2022-12-01 23:17:05,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt +45: [2022-12-01 23:17:05,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt +45: [2022-12-01 23:17:05,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt +45: [2022-12-01 23:17:05,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt +45: [2022-12-01 23:17:05,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +45: [2022-12-01 23:17:05,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +45: [2022-12-01 23:17:05,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +45: [2022-12-01 23:17:05,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +45: [2022-12-01 23:17:05,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +45: [2022-12-01 23:17:05,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +45: [2022-12-01 23:17:05,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +55: [2022-12-01 23:17:05,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt. +55: [2022-12-01 23:17:05,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt. +55: [2022-12-01 23:17:05,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt. +54: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt. +54: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt. +54: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt. +54: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt. +54: [2022-12-01 23:17:05,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt +54: [2022-12-01 23:17:05,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt +54: [2022-12-01 23:17:05,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt +54: [2022-12-01 23:17:05,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt +54: [2022-12-01 23:17:05,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt +54: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +54: [2022-12-01 23:17:05,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt +54: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +54: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +54: [2022-12-01 23:17:05,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt +54: [2022-12-01 23:17:05,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt +54: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +54: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +54: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +54: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +54: [2022-12-01 23:17:05,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 3: [2022-12-01 23:17:05,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. + 3: [2022-12-01 23:17:05,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt + 3: [2022-12-01 23:17:05,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +55: [2022-12-01 23:17:05,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt. +55: [2022-12-01 23:17:05,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt. +55: [2022-12-01 23:17:05,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt. +55: [2022-12-01 23:17:05,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt +55: [2022-12-01 23:17:05,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt +55: [2022-12-01 23:17:05,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt +55: [2022-12-01 23:17:05,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt. +55: [2022-12-01 23:17:05,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt. +55: [2022-12-01 23:17:05,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +55: [2022-12-01 23:17:05,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +55: [2022-12-01 23:17:05,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt +55: [2022-12-01 23:17:05,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt +55: [2022-12-01 23:17:05,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt +55: [2022-12-01 23:17:05,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +55: [2022-12-01 23:17:05,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt +55: [2022-12-01 23:17:05,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt +55: [2022-12-01 23:17:05,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +55: [2022-12-01 23:17:05,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +55: [2022-12-01 23:17:05,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +55: [2022-12-01 23:17:05,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +55: [2022-12-01 23:17:05,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +45: [2022-12-01 23:17:05,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt. +45: [2022-12-01 23:17:05,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt +45: [2022-12-01 23:17:05,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 0: [2022-12-01 23:17:05,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt + 0: [2022-12-01 23:17:05,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 7: [2022-12-01 23:17:05,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. + 7: [2022-12-01 23:17:05,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. + 7: [2022-12-01 23:17:05,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. + 7: [2022-12-01 23:17:05,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. + 7: [2022-12-01 23:17:05,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. + 7: [2022-12-01 23:17:05,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt + 7: [2022-12-01 23:17:05,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt + 7: [2022-12-01 23:17:05,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt + 7: [2022-12-01 23:17:05,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt + 7: [2022-12-01 23:17:05,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt + 7: [2022-12-01 23:17:05,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 7: [2022-12-01 23:17:05,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 7: [2022-12-01 23:17:05,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 7: [2022-12-01 23:17:05,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 7: [2022-12-01 23:17:05,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +47: [2022-12-01 23:17:05,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt. +47: [2022-12-01 23:17:05,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt +47: [2022-12-01 23:17:05,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +49: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt. +49: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt. +49: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt. +49: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt. +49: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt. +49: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt. +49: [2022-12-01 23:17:05,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt +49: [2022-12-01 23:17:05,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt +49: [2022-12-01 23:17:05,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt +49: [2022-12-01 23:17:05,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt +49: [2022-12-01 23:17:05,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt +49: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +49: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +49: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +49: [2022-12-01 23:17:05,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt +49: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +49: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +49: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +16: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. +16: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. +16: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. +16: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. +16: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. +16: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. +16: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. +16: [2022-12-01 23:17:05,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt +16: [2022-12-01 23:17:05,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt +16: [2022-12-01 23:17:05,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt +16: [2022-12-01 23:17:05,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt +16: [2022-12-01 23:17:05,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt +16: [2022-12-01 23:17:05,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt +16: [2022-12-01 23:17:05,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt +27: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. +27: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. +27: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. +27: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. +16: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +16: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +16: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +16: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +16: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +16: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +16: [2022-12-01 23:17:05,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +16: [2022-12-01 23:17:05,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. +16: [2022-12-01 23:17:05,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt +16: [2022-12-01 23:17:05,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +27: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. +27: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. +27: [2022-12-01 23:17:05,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt +27: [2022-12-01 23:17:05,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt +27: [2022-12-01 23:17:05,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt +27: [2022-12-01 23:17:05,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt +27: [2022-12-01 23:17:05,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt +27: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +27: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +27: [2022-12-01 23:17:05,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt +27: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +27: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +27: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +27: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 8: [2022-12-01 23:17:05,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +40: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt. +40: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt. +40: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt. +40: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt. +52: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt. +52: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt. +52: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt. +52: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt. +52: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt. +52: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt. + 8: [2022-12-01 23:17:05,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. + 8: [2022-12-01 23:17:05,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. + 8: [2022-12-01 23:17:05,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. + 8: [2022-12-01 23:17:05,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. + 8: [2022-12-01 23:17:05,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. + 8: [2022-12-01 23:17:05,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. + 8: [2022-12-01 23:17:05,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +40: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt. +40: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt. + 8: [2022-12-01 23:17:05,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt + 8: [2022-12-01 23:17:05,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 8: [2022-12-01 23:17:05,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt + 8: [2022-12-01 23:17:05,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt +40: [2022-12-01 23:17:05,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt + 8: [2022-12-01 23:17:05,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt + 8: [2022-12-01 23:17:05,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt + 8: [2022-12-01 23:17:05,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt +40: [2022-12-01 23:17:05,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt +40: [2022-12-01 23:17:05,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt + 8: [2022-12-01 23:17:05,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +40: [2022-12-01 23:17:05,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt + 8: [2022-12-01 23:17:05,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 8: [2022-12-01 23:17:05,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +40: [2022-12-01 23:17:05,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt + 8: [2022-12-01 23:17:05,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +61: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt. +61: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt. +61: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt. +61: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt. +61: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt. +40: [2022-12-01 23:17:05,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt + 8: [2022-12-01 23:17:05,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 8: [2022-12-01 23:17:05,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +40: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +40: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +40: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +40: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +40: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +40: [2022-12-01 23:17:05,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +52: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt. +52: [2022-12-01 23:17:05,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt +52: [2022-12-01 23:17:05,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt +52: [2022-12-01 23:17:05,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt +52: [2022-12-01 23:17:05,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt +52: [2022-12-01 23:17:05,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt +52: [2022-12-01 23:17:05,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt +52: [2022-12-01 23:17:05,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt +52: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt. +52: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +52: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +52: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +52: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +52: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +52: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +52: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +52: [2022-12-01 23:17:05,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt +13: [2022-12-01 23:17:05,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. +13: [2022-12-01 23:17:05,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. +13: [2022-12-01 23:17:05,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +13: [2022-12-01 23:17:05,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. +13: [2022-12-01 23:17:05,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +52: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +19: [2022-12-01 23:17:05,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. +19: [2022-12-01 23:17:05,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt +19: [2022-12-01 23:17:05,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +13: [2022-12-01 23:17:05,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. +13: [2022-12-01 23:17:05,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +13: [2022-12-01 23:17:05,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +13: [2022-12-01 23:17:05,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt +13: [2022-12-01 23:17:05,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt +13: [2022-12-01 23:17:05,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +13: [2022-12-01 23:17:05,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt +13: [2022-12-01 23:17:05,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +13: [2022-12-01 23:17:05,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt +13: [2022-12-01 23:17:05,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +13: [2022-12-01 23:17:05,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +13: [2022-12-01 23:17:05,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +13: [2022-12-01 23:17:05,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +13: [2022-12-01 23:17:05,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +13: [2022-12-01 23:17:05,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +13: [2022-12-01 23:17:05,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +13: [2022-12-01 23:17:05,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +13: [2022-12-01 23:17:05,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +13: [2022-12-01 23:17:05,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +61: [2022-12-01 23:17:05,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt +61: [2022-12-01 23:17:05,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt +61: [2022-12-01 23:17:05,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt +61: [2022-12-01 23:17:05,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt +61: [2022-12-01 23:17:05,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt +61: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +61: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +61: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +61: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +61: [2022-12-01 23:17:05,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +63: [2022-12-01 23:17:05,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt. +63: [2022-12-01 23:17:05,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt. +63: [2022-12-01 23:17:05,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt. +63: [2022-12-01 23:17:05,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt. +63: [2022-12-01 23:17:05,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt. +63: [2022-12-01 23:17:05,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt. +63: [2022-12-01 23:17:05,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt. +63: [2022-12-01 23:17:05,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt +63: [2022-12-01 23:17:05,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt +63: [2022-12-01 23:17:05,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt +63: [2022-12-01 23:17:05,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt +63: [2022-12-01 23:17:05,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt +63: [2022-12-01 23:17:05,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt +63: [2022-12-01 23:17:05,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt +63: [2022-12-01 23:17:05,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +63: [2022-12-01 23:17:05,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +63: [2022-12-01 23:17:05,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +63: [2022-12-01 23:17:05,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +63: [2022-12-01 23:17:05,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +63: [2022-12-01 23:17:05,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +63: [2022-12-01 23:17:05,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +63: [2022-12-01 23:17:05,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt. +14: [2022-12-01 23:17:05,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +63: [2022-12-01 23:17:05,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt +63: [2022-12-01 23:17:05,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +14: [2022-12-01 23:17:05,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. +14: [2022-12-01 23:17:05,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. +14: [2022-12-01 23:17:05,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +14: [2022-12-01 23:17:05,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. +14: [2022-12-01 23:17:05,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +14: [2022-12-01 23:17:05,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt +14: [2022-12-01 23:17:05,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +14: [2022-12-01 23:17:05,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +14: [2022-12-01 23:17:05,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +14: [2022-12-01 23:17:05,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt +14: [2022-12-01 23:17:05,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +14: [2022-12-01 23:17:05,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt +14: [2022-12-01 23:17:05,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +14: [2022-12-01 23:17:05,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +14: [2022-12-01 23:17:05,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +14: [2022-12-01 23:17:05,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +14: [2022-12-01 23:17:05,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +14: [2022-12-01 23:17:05,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +14: [2022-12-01 23:17:05,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +14: [2022-12-01 23:17:05,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +20: [2022-12-01 23:17:05,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. +20: [2022-12-01 23:17:05,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. +20: [2022-12-01 23:17:05,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. +20: [2022-12-01 23:17:05,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. +20: [2022-12-01 23:17:05,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. +20: [2022-12-01 23:17:05,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. +39: [2022-12-01 23:17:05,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt. +20: [2022-12-01 23:17:05,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. +20: [2022-12-01 23:17:05,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt +20: [2022-12-01 23:17:05,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt +20: [2022-12-01 23:17:05,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt +20: [2022-12-01 23:17:05,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt +20: [2022-12-01 23:17:05,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt +20: [2022-12-01 23:17:05,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt +20: [2022-12-01 23:17:05,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +20: [2022-12-01 23:17:05,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt +20: [2022-12-01 23:17:05,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +20: [2022-12-01 23:17:05,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +20: [2022-12-01 23:17:05,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +20: [2022-12-01 23:17:05,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +20: [2022-12-01 23:17:05,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +20: [2022-12-01 23:17:05,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +39: [2022-12-01 23:17:05,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt +39: [2022-12-01 23:17:05,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +15: [2022-12-01 23:17:05,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. +15: [2022-12-01 23:17:05,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +15: [2022-12-01 23:17:05,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. +15: [2022-12-01 23:17:05,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +15: [2022-12-01 23:17:05,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +15: [2022-12-01 23:17:05,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +15: [2022-12-01 23:17:05,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +15: [2022-12-01 23:17:05,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt +15: [2022-12-01 23:17:05,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt +15: [2022-12-01 23:17:05,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +15: [2022-12-01 23:17:05,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +15: [2022-12-01 23:17:05,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +15: [2022-12-01 23:17:05,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +15: [2022-12-01 23:17:05,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +15: [2022-12-01 23:17:05,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +15: [2022-12-01 23:17:05,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +15: [2022-12-01 23:17:05,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +15: [2022-12-01 23:17:05,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +44: [2022-12-01 23:17:05,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt. +44: [2022-12-01 23:17:05,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt +44: [2022-12-01 23:17:05,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +62: [2022-12-01 23:17:05,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt. +62: [2022-12-01 23:17:05,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt. +62: [2022-12-01 23:17:05,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt. +59: [2022-12-01 23:17:05,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt. +59: [2022-12-01 23:17:05,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt. +59: [2022-12-01 23:17:05,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt. +59: [2022-12-01 23:17:05,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt. +59: [2022-12-01 23:17:05,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt. +59: [2022-12-01 23:17:05,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt +59: [2022-12-01 23:17:05,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt +59: [2022-12-01 23:17:05,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt +59: [2022-12-01 23:17:05,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt +59: [2022-12-01 23:17:05,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt +59: [2022-12-01 23:17:05,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +59: [2022-12-01 23:17:05,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +59: [2022-12-01 23:17:05,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +59: [2022-12-01 23:17:05,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +59: [2022-12-01 23:17:05,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +62: [2022-12-01 23:17:05,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt +62: [2022-12-01 23:17:05,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt. +62: [2022-12-01 23:17:05,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt +62: [2022-12-01 23:17:05,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt +62: [2022-12-01 23:17:05,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +62: [2022-12-01 23:17:05,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt +62: [2022-12-01 23:17:05,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +62: [2022-12-01 23:17:05,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +62: [2022-12-01 23:17:05,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +62: [2022-12-01 23:17:05,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt. +62: [2022-12-01 23:17:05,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt. +62: [2022-12-01 23:17:05,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt +62: [2022-12-01 23:17:05,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt +62: [2022-12-01 23:17:05,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +62: [2022-12-01 23:17:05,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +42: [2022-12-01 23:17:05,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt. +61: [2022-12-01 23:17:05,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt. +61: [2022-12-01 23:17:05,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt +61: [2022-12-01 23:17:05,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +21: [2022-12-01 23:17:05,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. +21: [2022-12-01 23:17:05,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. +21: [2022-12-01 23:17:05,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. +21: [2022-12-01 23:17:05,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. +21: [2022-12-01 23:17:05,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. +21: [2022-12-01 23:17:05,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt +21: [2022-12-01 23:17:05,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt +21: [2022-12-01 23:17:05,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt +21: [2022-12-01 23:17:05,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt +21: [2022-12-01 23:17:05,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt +21: [2022-12-01 23:17:05,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +21: [2022-12-01 23:17:05,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +21: [2022-12-01 23:17:05,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +21: [2022-12-01 23:17:05,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +21: [2022-12-01 23:17:05,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +42: [2022-12-01 23:17:05,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt +42: [2022-12-01 23:17:05,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 5: [2022-12-01 23:17:05,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. + 5: [2022-12-01 23:17:05,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. + 5: [2022-12-01 23:17:05,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. + 5: [2022-12-01 23:17:05,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. + 5: [2022-12-01 23:17:05,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt + 5: [2022-12-01 23:17:05,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt + 5: [2022-12-01 23:17:05,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. + 5: [2022-12-01 23:17:05,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. + 5: [2022-12-01 23:17:05,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt + 5: [2022-12-01 23:17:05,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt + 5: [2022-12-01 23:17:05,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 5: [2022-12-01 23:17:05,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 5: [2022-12-01 23:17:05,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. + 5: [2022-12-01 23:17:05,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt + 5: [2022-12-01 23:17:05,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. + 5: [2022-12-01 23:17:05,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt + 5: [2022-12-01 23:17:05,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 5: [2022-12-01 23:17:05,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 5: [2022-12-01 23:17:05,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 5: [2022-12-01 23:17:05,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt + 5: [2022-12-01 23:17:05,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 5: [2022-12-01 23:17:05,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt + 5: [2022-12-01 23:17:05,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 5: [2022-12-01 23:17:05,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +17: [2022-12-01 23:17:05,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. +17: [2022-12-01 23:17:05,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. +17: [2022-12-01 23:17:05,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. +17: [2022-12-01 23:17:05,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. +17: [2022-12-01 23:17:05,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. +17: [2022-12-01 23:17:05,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. +17: [2022-12-01 23:17:05,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. +17: [2022-12-01 23:17:05,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt +17: [2022-12-01 23:17:05,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt +17: [2022-12-01 23:17:05,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. +17: [2022-12-01 23:17:05,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt +17: [2022-12-01 23:17:05,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt +17: [2022-12-01 23:17:05,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt +17: [2022-12-01 23:17:05,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt +17: [2022-12-01 23:17:05,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +17: [2022-12-01 23:17:05,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +17: [2022-12-01 23:17:05,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt +17: [2022-12-01 23:17:05,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +17: [2022-12-01 23:17:05,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +17: [2022-12-01 23:17:05,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt +17: [2022-12-01 23:17:05,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +17: [2022-12-01 23:17:05,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +17: [2022-12-01 23:17:05,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +17: [2022-12-01 23:17:05,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +62: [2022-12-01 23:17:05,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt. +60: [2022-12-01 23:17:05,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt. +60: [2022-12-01 23:17:05,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt. +60: [2022-12-01 23:17:05,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt. +60: [2022-12-01 23:17:05,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt. +60: [2022-12-01 23:17:05,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt. +60: [2022-12-01 23:17:05,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt. +60: [2022-12-01 23:17:05,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt +60: [2022-12-01 23:17:05,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt +60: [2022-12-01 23:17:05,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt +60: [2022-12-01 23:17:05,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt +60: [2022-12-01 23:17:05,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt +60: [2022-12-01 23:17:05,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt +60: [2022-12-01 23:17:05,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +60: [2022-12-01 23:17:05,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +60: [2022-12-01 23:17:05,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +60: [2022-12-01 23:17:05,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +60: [2022-12-01 23:17:05,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +60: [2022-12-01 23:17:05,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +62: [2022-12-01 23:17:05,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt +62: [2022-12-01 23:17:05,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +62: [2022-12-01 23:17:05,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt. +62: [2022-12-01 23:17:05,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt +62: [2022-12-01 23:17:05,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 4: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. + 4: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. + 4: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. + 4: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. + 4: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. + 4: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. + 4: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. + 4: [2022-12-01 23:17:05,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt + 4: [2022-12-01 23:17:05,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt + 4: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. + 4: [2022-12-01 23:17:05,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt + 4: [2022-12-01 23:17:05,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt + 4: [2022-12-01 23:17:05,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt + 4: [2022-12-01 23:17:05,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt + 4: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 4: [2022-12-01 23:17:05,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt + 4: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 4: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 4: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 4: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 4: [2022-12-01 23:17:05,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt + 4: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 4: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 4: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 1: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. + 1: [2022-12-01 23:17:05,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt + 1: [2022-12-01 23:17:05,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 1: [2022-12-01 23:17:05,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. + 1: [2022-12-01 23:17:05,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. + 1: [2022-12-01 23:17:05,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt + 1: [2022-12-01 23:17:05,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt + 1: [2022-12-01 23:17:05,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 1: [2022-12-01 23:17:05,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +36: [2022-12-01 23:17:05,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt. +36: [2022-12-01 23:17:05,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt. +36: [2022-12-01 23:17:05,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt. +36: [2022-12-01 23:17:05,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt. +36: [2022-12-01 23:17:05,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt. +36: [2022-12-01 23:17:05,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt. +36: [2022-12-01 23:17:05,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt. +36: [2022-12-01 23:17:05,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt. +36: [2022-12-01 23:17:05,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt +36: [2022-12-01 23:17:05,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt +36: [2022-12-01 23:17:05,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt +36: [2022-12-01 23:17:05,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt +36: [2022-12-01 23:17:05,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt +36: [2022-12-01 23:17:05,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt +36: [2022-12-01 23:17:05,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt +36: [2022-12-01 23:17:05,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt +36: [2022-12-01 23:17:05,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +36: [2022-12-01 23:17:05,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +36: [2022-12-01 23:17:05,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +36: [2022-12-01 23:17:05,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +36: [2022-12-01 23:17:05,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +36: [2022-12-01 23:17:05,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +36: [2022-12-01 23:17:05,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +36: [2022-12-01 23:17:05,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +24: [2022-12-01 23:17:05,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. +24: [2022-12-01 23:17:05,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt +24: [2022-12-01 23:17:05,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +10: [2022-12-01 23:17:05,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +10: [2022-12-01 23:17:05,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +10: [2022-12-01 23:17:05,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. +10: [2022-12-01 23:17:05,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +10: [2022-12-01 23:17:05,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +10: [2022-12-01 23:17:05,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. +10: [2022-12-01 23:17:05,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +10: [2022-12-01 23:17:05,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +10: [2022-12-01 23:17:05,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +10: [2022-12-01 23:17:05,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt +10: [2022-12-01 23:17:05,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +10: [2022-12-01 23:17:05,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt +10: [2022-12-01 23:17:05,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +10: [2022-12-01 23:17:05,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +10: [2022-12-01 23:17:05,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +10: [2022-12-01 23:17:05,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +10: [2022-12-01 23:17:05,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +10: [2022-12-01 23:17:05,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +18: [2022-12-01 23:17:05,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. +18: [2022-12-01 23:17:05,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. +18: [2022-12-01 23:17:05,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. +18: [2022-12-01 23:17:05,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. +18: [2022-12-01 23:17:05,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. +18: [2022-12-01 23:17:05,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. +18: [2022-12-01 23:17:05,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt +18: [2022-12-01 23:17:05,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. +18: [2022-12-01 23:17:05,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. +18: [2022-12-01 23:17:05,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +18: [2022-12-01 23:17:05,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt +18: [2022-12-01 23:17:05,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt +18: [2022-12-01 23:17:05,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt +18: [2022-12-01 23:17:05,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt +18: [2022-12-01 23:17:05,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt +18: [2022-12-01 23:17:05,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt +18: [2022-12-01 23:17:05,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt +18: [2022-12-01 23:17:05,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +18: [2022-12-01 23:17:05,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +18: [2022-12-01 23:17:05,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +18: [2022-12-01 23:17:05,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +18: [2022-12-01 23:17:05,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +18: [2022-12-01 23:17:05,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +18: [2022-12-01 23:17:05,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +51: [2022-12-01 23:17:05,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt. +51: [2022-12-01 23:17:05,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt +51: [2022-12-01 23:17:05,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +60: [2022-12-01 23:17:05,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt. +60: [2022-12-01 23:17:05,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt +60: [2022-12-01 23:17:05,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +10: [2022-12-01 23:17:05,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. +10: [2022-12-01 23:17:05,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt +10: [2022-12-01 23:17:05,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +35: [2022-12-01 23:17:05,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt. +35: [2022-12-01 23:17:05,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt +35: [2022-12-01 23:17:05,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +35: [2022-12-01 23:17:05,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt. +35: [2022-12-01 23:17:05,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt +35: [2022-12-01 23:17:05,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +24: [2022-12-01 23:17:05,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. +24: [2022-12-01 23:17:05,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt +24: [2022-12-01 23:17:05,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +11: [2022-12-01 23:17:05,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. +11: [2022-12-01 23:17:05,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt +11: [2022-12-01 23:17:05,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +11: [2022-12-01 23:17:05,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +11: [2022-12-01 23:17:05,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +11: [2022-12-01 23:17:05,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 8: [2022-12-01 23:17:05,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. + 8: [2022-12-01 23:17:05,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt + 8: [2022-12-01 23:17:05,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +59: [2022-12-01 23:17:05,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt. +59: [2022-12-01 23:17:05,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt +59: [2022-12-01 23:17:05,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +10: [2022-12-01 23:17:05,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. +10: [2022-12-01 23:17:05,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt +10: [2022-12-01 23:17:05,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +34: [2022-12-01 23:17:05,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt. +34: [2022-12-01 23:17:05,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt +34: [2022-12-01 23:17:05,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +37: [2022-12-01 23:17:05,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt. +37: [2022-12-01 23:17:05,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt +37: [2022-12-01 23:17:05,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +11: [2022-12-01 23:17:05,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. +11: [2022-12-01 23:17:05,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt +11: [2022-12-01 23:17:05,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +59: [2022-12-01 23:17:05,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt. +23: [2022-12-01 23:17:05,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. +23: [2022-12-01 23:17:05,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt +23: [2022-12-01 23:17:05,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +59: [2022-12-01 23:17:05,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt +59: [2022-12-01 23:17:05,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 6: [2022-12-01 23:17:05,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. + 6: [2022-12-01 23:17:05,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt + 6: [2022-12-01 23:17:05,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +35: [2022-12-01 23:17:05,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt. +35: [2022-12-01 23:17:05,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt +35: [2022-12-01 23:17:05,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +20: [2022-12-01 23:17:05,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. +20: [2022-12-01 23:17:05,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt +20: [2022-12-01 23:17:05,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +56: [2022-12-01 23:17:05,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt. +56: [2022-12-01 23:17:05,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt +56: [2022-12-01 23:17:05,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +59: [2022-12-01 23:17:05,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt. +59: [2022-12-01 23:17:05,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt +59: [2022-12-01 23:17:05,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +42: [2022-12-01 23:17:05,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt. +42: [2022-12-01 23:17:05,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt +42: [2022-12-01 23:17:05,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +48: [2022-12-01 23:17:05,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt. +48: [2022-12-01 23:17:05,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt +48: [2022-12-01 23:17:05,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +41: [2022-12-01 23:17:05,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt. +41: [2022-12-01 23:17:05,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt +41: [2022-12-01 23:17:05,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +46: [2022-12-01 23:17:05,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt. +46: [2022-12-01 23:17:05,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt +46: [2022-12-01 23:17:05,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +28: [2022-12-01 23:17:05,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. +28: [2022-12-01 23:17:05,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt +28: [2022-12-01 23:17:05,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 7: [2022-12-01 23:17:05,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. + 7: [2022-12-01 23:17:05,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt + 7: [2022-12-01 23:17:05,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +21: [2022-12-01 23:17:05,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. +21: [2022-12-01 23:17:05,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt +21: [2022-12-01 23:17:05,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +27: [2022-12-01 23:17:05,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. +27: [2022-12-01 23:17:05,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt +27: [2022-12-01 23:17:05,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +25: [2022-12-01 23:17:05,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. +14: [2022-12-01 23:17:05,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. +25: [2022-12-01 23:17:05,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt +25: [2022-12-01 23:17:05,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +14: [2022-12-01 23:17:05,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt +14: [2022-12-01 23:17:05,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +43: [2022-12-01 23:17:05,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt. +43: [2022-12-01 23:17:05,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt +43: [2022-12-01 23:17:05,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +33: [2022-12-01 23:17:05,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt. +33: [2022-12-01 23:17:05,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt +33: [2022-12-01 23:17:05,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 2: [2022-12-01 23:17:05,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +31: [2022-12-01 23:17:05,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. +31: [2022-12-01 23:17:05,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt +31: [2022-12-01 23:17:05,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 2: [2022-12-01 23:17:05,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt + 2: [2022-12-01 23:17:05,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +51: [2022-12-01 23:17:05,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt. +51: [2022-12-01 23:17:05,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt +51: [2022-12-01 23:17:05,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +22: [2022-12-01 23:17:05,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. +22: [2022-12-01 23:17:05,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt +22: [2022-12-01 23:17:05,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +30: [2022-12-01 23:17:05,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. +30: [2022-12-01 23:17:05,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt +30: [2022-12-01 23:17:05,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +15: [2022-12-01 23:17:05,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. +15: [2022-12-01 23:17:05,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt +15: [2022-12-01 23:17:05,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +61: [2022-12-01 23:17:05,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt. +61: [2022-12-01 23:17:05,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt +61: [2022-12-01 23:17:05,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +24: [2022-12-01 23:17:05,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. +24: [2022-12-01 23:17:05,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt +24: [2022-12-01 23:17:05,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +60: [2022-12-01 23:17:05,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt. +60: [2022-12-01 23:17:05,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt +60: [2022-12-01 23:17:05,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 3: [2022-12-01 23:17:05,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. + 3: [2022-12-01 23:17:05,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt + 3: [2022-12-01 23:17:05,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +42: [2022-12-01 23:17:05,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt. +42: [2022-12-01 23:17:05,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt +42: [2022-12-01 23:17:05,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +11: [2022-12-01 23:17:05,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +11: [2022-12-01 23:17:05,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +11: [2022-12-01 23:17:05,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +40: [2022-12-01 23:17:05,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt. +19: [2022-12-01 23:17:05,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. +40: [2022-12-01 23:17:05,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt +19: [2022-12-01 23:17:05,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt +40: [2022-12-01 23:17:05,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +19: [2022-12-01 23:17:05,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +11: [2022-12-01 23:17:05,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. +11: [2022-12-01 23:17:05,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt +11: [2022-12-01 23:17:05,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +34: [2022-12-01 23:17:05,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt. +34: [2022-12-01 23:17:05,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt +34: [2022-12-01 23:17:05,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +39: [2022-12-01 23:17:05,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt. +53: [2022-12-01 23:17:05,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt. +53: [2022-12-01 23:17:05,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt +53: [2022-12-01 23:17:05,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +39: [2022-12-01 23:17:05,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt +39: [2022-12-01 23:17:05,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +49: [2022-12-01 23:17:05,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt. +49: [2022-12-01 23:17:05,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt +49: [2022-12-01 23:17:05,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +28: [2022-12-01 23:17:05,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. +28: [2022-12-01 23:17:05,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt +28: [2022-12-01 23:17:05,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +56: [2022-12-01 23:17:05,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt. +56: [2022-12-01 23:17:05,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt +56: [2022-12-01 23:17:05,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 6: [2022-12-01 23:17:05,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. + 6: [2022-12-01 23:17:05,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt + 6: [2022-12-01 23:17:05,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +37: [2022-12-01 23:17:05,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt. +37: [2022-12-01 23:17:05,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt +37: [2022-12-01 23:17:05,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +23: [2022-12-01 23:17:05,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. +23: [2022-12-01 23:17:05,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt +23: [2022-12-01 23:17:05,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +27: [2022-12-01 23:17:05,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. +27: [2022-12-01 23:17:05,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt +27: [2022-12-01 23:17:05,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +30: [2022-12-01 23:17:05,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. +30: [2022-12-01 23:17:05,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt +30: [2022-12-01 23:17:05,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +56: [2022-12-01 23:17:05,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt. +47: [2022-12-01 23:17:05,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt. +47: [2022-12-01 23:17:05,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt +47: [2022-12-01 23:17:05,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +49: [2022-12-01 23:17:05,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt. +49: [2022-12-01 23:17:05,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt +49: [2022-12-01 23:17:05,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +25: [2022-12-01 23:17:05,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. +38: [2022-12-01 23:17:05,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt. +38: [2022-12-01 23:17:05,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt +25: [2022-12-01 23:17:05,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt +38: [2022-12-01 23:17:05,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +25: [2022-12-01 23:17:05,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +56: [2022-12-01 23:17:05,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt +56: [2022-12-01 23:17:05,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +43: [2022-12-01 23:17:05,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt. +43: [2022-12-01 23:17:05,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt +43: [2022-12-01 23:17:05,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +33: [2022-12-01 23:17:05,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt. +33: [2022-12-01 23:17:05,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt +33: [2022-12-01 23:17:05,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +44: [2022-12-01 23:17:05,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt. +44: [2022-12-01 23:17:05,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt +44: [2022-12-01 23:17:05,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +22: [2022-12-01 23:17:05,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. +22: [2022-12-01 23:17:05,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt +22: [2022-12-01 23:17:05,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +61: [2022-12-01 23:17:05,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt. +15: [2022-12-01 23:17:05,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. +15: [2022-12-01 23:17:05,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt +15: [2022-12-01 23:17:05,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 3: [2022-12-01 23:17:05,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. + 3: [2022-12-01 23:17:05,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt + 3: [2022-12-01 23:17:05,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +61: [2022-12-01 23:17:05,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt +61: [2022-12-01 23:17:05,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +19: [2022-12-01 23:17:05,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. +19: [2022-12-01 23:17:05,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt +19: [2022-12-01 23:17:05,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +34: [2022-12-01 23:17:05,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt. +34: [2022-12-01 23:17:05,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt +34: [2022-12-01 23:17:05,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 6: [2022-12-01 23:17:05,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. + 6: [2022-12-01 23:17:05,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt + 6: [2022-12-01 23:17:05,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 9: [2022-12-01 23:17:05,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. + 9: [2022-12-01 23:17:05,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt + 7: [2022-12-01 23:17:05,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. +21: [2022-12-01 23:17:05,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. + 9: [2022-12-01 23:17:05,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +21: [2022-12-01 23:17:05,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt +21: [2022-12-01 23:17:05,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 7: [2022-12-01 23:17:05,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt + 7: [2022-12-01 23:17:05,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +23: [2022-12-01 23:17:05,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. +23: [2022-12-01 23:17:05,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt +23: [2022-12-01 23:17:05,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +39: [2022-12-01 23:17:05,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt. +39: [2022-12-01 23:17:05,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt +39: [2022-12-01 23:17:05,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +46: [2022-12-01 23:17:05,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt. +46: [2022-12-01 23:17:05,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt +46: [2022-12-01 23:17:05,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +37: [2022-12-01 23:17:05,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt. +37: [2022-12-01 23:17:05,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt +37: [2022-12-01 23:17:05,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +31: [2022-12-01 23:17:05,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. +31: [2022-12-01 23:17:05,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt +31: [2022-12-01 23:17:05,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 2: [2022-12-01 23:17:05,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +42: [2022-12-01 23:17:05,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt. +42: [2022-12-01 23:17:05,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt +42: [2022-12-01 23:17:05,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +48: [2022-12-01 23:17:05,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt. +48: [2022-12-01 23:17:05,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt +48: [2022-12-01 23:17:05,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +22: [2022-12-01 23:17:05,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. +22: [2022-12-01 23:17:05,793] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt +22: [2022-12-01 23:17:05,793] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 7: [2022-12-01 23:17:05,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. + 7: [2022-12-01 23:17:05,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt + 7: [2022-12-01 23:17:05,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +46: [2022-12-01 23:17:05,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt. +46: [2022-12-01 23:17:05,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt +46: [2022-12-01 23:17:05,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 2: [2022-12-01 23:17:05,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +38: [2022-12-01 23:17:05,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt. + 2: [2022-12-01 23:17:05,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +38: [2022-12-01 23:17:05,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt +38: [2022-12-01 23:17:05,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +48: [2022-12-01 23:17:05,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt. +48: [2022-12-01 23:17:05,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt +48: [2022-12-01 23:17:05,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +40: [2022-12-01 23:17:05,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt. +40: [2022-12-01 23:17:05,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt +40: [2022-12-01 23:17:05,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +31: [2022-12-01 23:17:05,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. +31: [2022-12-01 23:17:05,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt +31: [2022-12-01 23:17:05,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +53: [2022-12-01 23:17:05,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt. +53: [2022-12-01 23:17:05,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt +53: [2022-12-01 23:17:05,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +51: [2022-12-01 23:17:05,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt. +51: [2022-12-01 23:17:05,817] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt +51: [2022-12-01 23:17:05,817] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +21: [2022-12-01 23:17:05,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. +21: [2022-12-01 23:17:05,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt +21: [2022-12-01 23:17:05,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +47: [2022-12-01 23:17:05,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt. +47: [2022-12-01 23:17:05,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt +47: [2022-12-01 23:17:05,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +24: [2022-12-01 23:17:05,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. +24: [2022-12-01 23:17:05,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt +24: [2022-12-01 23:17:05,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +44: [2022-12-01 23:17:05,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt. +44: [2022-12-01 23:17:05,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt +44: [2022-12-01 23:17:05,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 1: [2022-12-01 23:17:05,939] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. + 1: [2022-12-01 23:17:05,939] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt + 1: [2022-12-01 23:17:05,939] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 1: [2022-12-01 23:17:05,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. + 1: [2022-12-01 23:17:05,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt + 1: [2022-12-01 23:17:05,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +44: [2022-12-01 23:17:05,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt. +44: [2022-12-01 23:17:05,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt +44: [2022-12-01 23:17:05,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +43: [2022-12-01 23:17:06,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt. +43: [2022-12-01 23:17:06,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step3000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt +43: [2022-12-01 23:17:06,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! + 0: successfully saved checkpoint at iteration 3000 to checkpoints_8b7beta +63: time (ms) | save-checkpoint: 7166.16 +63: iteration 3010/ 5494 | consumed samples: 3082240 | consumed tokens: 6312427520 | elapsed time per iteration (s): 6.39 | learning rate: 9.780E-05 | global batch size: 1024 | lm loss: 2.260832E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 160.138 | TFLOPs: 35.80 | +63: iteration 3020/ 5494 | consumed samples: 3092480 | consumed tokens: 6333399040 | elapsed time per iteration (s): 5.64 | learning rate: 9.728E-05 | global batch size: 1024 | lm loss: 2.281606E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.589 | TFLOPs: 40.60 | +63: iteration 3030/ 5494 | consumed samples: 3102720 | consumed tokens: 6354370560 | elapsed time per iteration (s): 5.38 | learning rate: 9.677E-05 | global batch size: 1024 | lm loss: 2.266140E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.287 | TFLOPs: 42.54 | +63: iteration 3040/ 5494 | consumed samples: 3112960 | consumed tokens: 6375342080 | elapsed time per iteration (s): 5.98 | learning rate: 9.625E-05 | global batch size: 1024 | lm loss: 2.284115E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.234 | TFLOPs: 38.28 | +63: iteration 3050/ 5494 | consumed samples: 3123200 | consumed tokens: 6396313600 | elapsed time per iteration (s): 5.71 | learning rate: 9.574E-05 | global batch size: 1024 | lm loss: 2.255386E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.179 | TFLOPs: 40.06 | +63: iteration 3060/ 5494 | consumed samples: 3133440 | consumed tokens: 6417285120 | elapsed time per iteration (s): 5.70 | learning rate: 9.523E-05 | global batch size: 1024 | lm loss: 2.269578E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.545 | TFLOPs: 40.14 | +63: iteration 3070/ 5494 | consumed samples: 3143680 | consumed tokens: 6438256640 | elapsed time per iteration (s): 5.67 | learning rate: 9.472E-05 | global batch size: 1024 | lm loss: 2.273429E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.537 | TFLOPs: 40.36 | +63: iteration 3080/ 5494 | consumed samples: 3153920 | consumed tokens: 6459228160 | elapsed time per iteration (s): 5.52 | learning rate: 9.420E-05 | global batch size: 1024 | lm loss: 2.272892E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.640 | TFLOPs: 41.50 | +63: iteration 3090/ 5494 | consumed samples: 3164160 | consumed tokens: 6480199680 | elapsed time per iteration (s): 8.62 | learning rate: 9.369E-05 | global batch size: 1024 | lm loss: 2.278628E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 118.729 | TFLOPs: 26.54 | +63: iteration 3100/ 5494 | consumed samples: 3174400 | consumed tokens: 6501171200 | elapsed time per iteration (s): 5.90 | learning rate: 9.318E-05 | global batch size: 1024 | lm loss: 2.266628E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.552 | TFLOPs: 38.80 | +63: iteration 3110/ 5494 | consumed samples: 3184640 | consumed tokens: 6522142720 | elapsed time per iteration (s): 5.50 | learning rate: 9.267E-05 | global batch size: 1024 | lm loss: 2.271341E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.064 | TFLOPs: 41.60 | +63: iteration 3120/ 5494 | consumed samples: 3194880 | consumed tokens: 6543114240 | elapsed time per iteration (s): 5.63 | learning rate: 9.216E-05 | global batch size: 1024 | lm loss: 2.268076E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.851 | TFLOPs: 40.66 | +63: iteration 3130/ 5494 | consumed samples: 3205120 | consumed tokens: 6564085760 | elapsed time per iteration (s): 5.47 | learning rate: 9.165E-05 | global batch size: 1024 | lm loss: 2.251429E+00 | grad norm: 0.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 187.238 | TFLOPs: 41.86 | +63: iteration 3140/ 5494 | consumed samples: 3215360 | consumed tokens: 6585057280 | elapsed time per iteration (s): 6.06 | learning rate: 9.114E-05 | global batch size: 1024 | lm loss: 2.263458E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.898 | TFLOPs: 37.76 | +63: iteration 3150/ 5494 | consumed samples: 3225600 | consumed tokens: 6606028800 | elapsed time per iteration (s): 5.58 | learning rate: 9.063E-05 | global batch size: 1024 | lm loss: 2.267832E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.666 | TFLOPs: 41.06 | +63: iteration 3160/ 5494 | consumed samples: 3235840 | consumed tokens: 6627000320 | elapsed time per iteration (s): 5.62 | learning rate: 9.013E-05 | global batch size: 1024 | lm loss: 2.260285E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.092 | TFLOPs: 40.71 | +63: iteration 3170/ 5494 | consumed samples: 3246080 | consumed tokens: 6647971840 | elapsed time per iteration (s): 5.38 | learning rate: 8.962E-05 | global batch size: 1024 | lm loss: 2.258984E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.485 | TFLOPs: 42.59 | +63: iteration 3180/ 5494 | consumed samples: 3256320 | consumed tokens: 6668943360 | elapsed time per iteration (s): 5.51 | learning rate: 8.911E-05 | global batch size: 1024 | lm loss: 2.249225E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.732 | TFLOPs: 41.52 | +63: iteration 3190/ 5494 | consumed samples: 3266560 | consumed tokens: 6689914880 | elapsed time per iteration (s): 5.49 | learning rate: 8.861E-05 | global batch size: 1024 | lm loss: 2.256877E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.388 | TFLOPs: 41.67 | +63: iteration 3200/ 5494 | consumed samples: 3276800 | consumed tokens: 6710886400 | elapsed time per iteration (s): 5.39 | learning rate: 8.810E-05 | global batch size: 1024 | lm loss: 2.254709E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.010 | TFLOPs: 42.48 | +63: iteration 3210/ 5494 | consumed samples: 3287040 | consumed tokens: 6731857920 | elapsed time per iteration (s): 5.50 | learning rate: 8.760E-05 | global batch size: 1024 | lm loss: 2.253510E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.347 | TFLOPs: 41.66 | +63: iteration 3220/ 5494 | consumed samples: 3297280 | consumed tokens: 6752829440 | elapsed time per iteration (s): 5.65 | learning rate: 8.710E-05 | global batch size: 1024 | lm loss: 2.240266E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.355 | TFLOPs: 40.54 | +63: iteration 3230/ 5494 | consumed samples: 3307520 | consumed tokens: 6773800960 | elapsed time per iteration (s): 5.65 | learning rate: 8.660E-05 | global batch size: 1024 | lm loss: 2.264645E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.274 | TFLOPs: 40.53 | +63: iteration 3240/ 5494 | consumed samples: 3317760 | consumed tokens: 6794772480 | elapsed time per iteration (s): 5.52 | learning rate: 8.609E-05 | global batch size: 1024 | lm loss: 2.249137E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.667 | TFLOPs: 41.51 | +63: iteration 3250/ 5494 | consumed samples: 3328000 | consumed tokens: 6815744000 | elapsed time per iteration (s): 5.48 | learning rate: 8.559E-05 | global batch size: 1024 | lm loss: 2.246150E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.880 | TFLOPs: 41.78 | +63: iteration 3260/ 5494 | consumed samples: 3338240 | consumed tokens: 6836715520 | elapsed time per iteration (s): 5.72 | learning rate: 8.509E-05 | global batch size: 1024 | lm loss: 2.248843E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.073 | TFLOPs: 40.03 | +63: iteration 3270/ 5494 | consumed samples: 3348480 | consumed tokens: 6857687040 | elapsed time per iteration (s): 5.55 | learning rate: 8.459E-05 | global batch size: 1024 | lm loss: 2.246715E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.361 | TFLOPs: 41.22 | +63: iteration 3280/ 5494 | consumed samples: 3358720 | consumed tokens: 6878658560 | elapsed time per iteration (s): 5.61 | learning rate: 8.410E-05 | global batch size: 1024 | lm loss: 2.252427E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.453 | TFLOPs: 40.79 | +63: iteration 3290/ 5494 | consumed samples: 3368960 | consumed tokens: 6899630080 | elapsed time per iteration (s): 5.61 | learning rate: 8.360E-05 | global batch size: 1024 | lm loss: 2.237853E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.525 | TFLOPs: 40.81 | +63: iteration 3300/ 5494 | consumed samples: 3379200 | consumed tokens: 6920601600 | elapsed time per iteration (s): 5.40 | learning rate: 8.310E-05 | global batch size: 1024 | lm loss: 2.240268E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.484 | TFLOPs: 42.36 | +63: iteration 3310/ 5494 | consumed samples: 3389440 | consumed tokens: 6941573120 | elapsed time per iteration (s): 5.62 | learning rate: 8.261E-05 | global batch size: 1024 | lm loss: 2.244103E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.066 | TFLOPs: 40.70 | +63: iteration 3320/ 5494 | consumed samples: 3399680 | consumed tokens: 6962544640 | elapsed time per iteration (s): 6.05 | learning rate: 8.211E-05 | global batch size: 1024 | lm loss: 2.248048E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 169.269 | TFLOPs: 37.84 | +63: iteration 3330/ 5494 | consumed samples: 3409920 | consumed tokens: 6983516160 | elapsed time per iteration (s): 5.86 | learning rate: 8.162E-05 | global batch size: 1024 | lm loss: 2.251892E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.679 | TFLOPs: 39.05 | +63: iteration 3340/ 5494 | consumed samples: 3420160 | consumed tokens: 7004487680 | elapsed time per iteration (s): 5.70 | learning rate: 8.113E-05 | global batch size: 1024 | lm loss: 2.239099E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.791 | TFLOPs: 40.20 | +63: iteration 3350/ 5494 | consumed samples: 3430400 | consumed tokens: 7025459200 | elapsed time per iteration (s): 5.78 | learning rate: 8.063E-05 | global batch size: 1024 | lm loss: 2.243501E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.135 | TFLOPs: 39.60 | +63: iteration 3360/ 5494 | consumed samples: 3440640 | consumed tokens: 7046430720 | elapsed time per iteration (s): 5.58 | learning rate: 8.014E-05 | global batch size: 1024 | lm loss: 2.253595E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.609 | TFLOPs: 41.05 | +63: iteration 3370/ 5494 | consumed samples: 3450880 | consumed tokens: 7067402240 | elapsed time per iteration (s): 5.83 | learning rate: 7.965E-05 | global batch size: 1024 | lm loss: 2.240745E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.560 | TFLOPs: 39.25 | +63: iteration 3380/ 5494 | consumed samples: 3461120 | consumed tokens: 7088373760 | elapsed time per iteration (s): 5.66 | learning rate: 7.916E-05 | global batch size: 1024 | lm loss: 2.239463E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.864 | TFLOPs: 40.44 | +63: iteration 3390/ 5494 | consumed samples: 3471360 | consumed tokens: 7109345280 | elapsed time per iteration (s): 5.81 | learning rate: 7.868E-05 | global batch size: 1024 | lm loss: 2.241297E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.276 | TFLOPs: 39.41 | +63: iteration 3400/ 5494 | consumed samples: 3481600 | consumed tokens: 7130316800 | elapsed time per iteration (s): 5.89 | learning rate: 7.819E-05 | global batch size: 1024 | lm loss: 2.248381E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.943 | TFLOPs: 38.89 | +63: iteration 3410/ 5494 | consumed samples: 3491840 | consumed tokens: 7151288320 | elapsed time per iteration (s): 5.79 | learning rate: 7.770E-05 | global batch size: 1024 | lm loss: 2.232587E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.839 | TFLOPs: 39.54 | +63: iteration 3420/ 5494 | consumed samples: 3502080 | consumed tokens: 7172259840 | elapsed time per iteration (s): 5.50 | learning rate: 7.722E-05 | global batch size: 1024 | lm loss: 2.243279E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.081 | TFLOPs: 41.60 | +63: iteration 3430/ 5494 | consumed samples: 3512320 | consumed tokens: 7193231360 | elapsed time per iteration (s): 5.54 | learning rate: 7.674E-05 | global batch size: 1024 | lm loss: 2.239728E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.942 | TFLOPs: 41.35 | +63: iteration 3440/ 5494 | consumed samples: 3522560 | consumed tokens: 7214202880 | elapsed time per iteration (s): 5.78 | learning rate: 7.625E-05 | global batch size: 1024 | lm loss: 2.236959E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.312 | TFLOPs: 39.64 | +63: iteration 3450/ 5494 | consumed samples: 3532800 | consumed tokens: 7235174400 | elapsed time per iteration (s): 5.42 | learning rate: 7.577E-05 | global batch size: 1024 | lm loss: 2.246286E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.042 | TFLOPs: 42.26 | +63: iteration 3460/ 5494 | consumed samples: 3543040 | consumed tokens: 7256145920 | elapsed time per iteration (s): 5.72 | learning rate: 7.529E-05 | global batch size: 1024 | lm loss: 2.229983E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.158 | TFLOPs: 40.05 | +63: iteration 3470/ 5494 | consumed samples: 3553280 | consumed tokens: 7277117440 | elapsed time per iteration (s): 5.55 | learning rate: 7.481E-05 | global batch size: 1024 | lm loss: 2.242155E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.528 | TFLOPs: 41.25 | +63: iteration 3480/ 5494 | consumed samples: 3563520 | consumed tokens: 7298088960 | elapsed time per iteration (s): 5.66 | learning rate: 7.433E-05 | global batch size: 1024 | lm loss: 2.227739E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.794 | TFLOPs: 40.42 | +63: iteration 3490/ 5494 | consumed samples: 3573760 | consumed tokens: 7319060480 | elapsed time per iteration (s): 5.61 | learning rate: 7.386E-05 | global batch size: 1024 | lm loss: 2.241823E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.491 | TFLOPs: 40.80 | +63: iteration 3500/ 5494 | consumed samples: 3584000 | consumed tokens: 7340032000 | elapsed time per iteration (s): 5.65 | learning rate: 7.338E-05 | global batch size: 1024 | lm loss: 2.216797E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.279 | TFLOPs: 40.53 | +63: iteration 3510/ 5494 | consumed samples: 3594240 | consumed tokens: 7361003520 | elapsed time per iteration (s): 5.43 | learning rate: 7.291E-05 | global batch size: 1024 | lm loss: 2.244756E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.596 | TFLOPs: 42.16 | +63: iteration 3520/ 5494 | consumed samples: 3604480 | consumed tokens: 7381975040 | elapsed time per iteration (s): 5.61 | learning rate: 7.243E-05 | global batch size: 1024 | lm loss: 2.215876E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.410 | TFLOPs: 40.78 | +63: iteration 3530/ 5494 | consumed samples: 3614720 | consumed tokens: 7402946560 | elapsed time per iteration (s): 5.64 | learning rate: 7.196E-05 | global batch size: 1024 | lm loss: 2.227877E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.678 | TFLOPs: 40.62 | +63: iteration 3540/ 5494 | consumed samples: 3624960 | consumed tokens: 7423918080 | elapsed time per iteration (s): 5.61 | learning rate: 7.149E-05 | global batch size: 1024 | lm loss: 2.241483E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.657 | TFLOPs: 40.84 | +63: iteration 3550/ 5494 | consumed samples: 3635200 | consumed tokens: 7444889600 | elapsed time per iteration (s): 5.94 | learning rate: 7.102E-05 | global batch size: 1024 | lm loss: 2.235342E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.296 | TFLOPs: 38.52 | +63: iteration 3560/ 5494 | consumed samples: 3645440 | consumed tokens: 7465861120 | elapsed time per iteration (s): 5.71 | learning rate: 7.056E-05 | global batch size: 1024 | lm loss: 2.226452E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.353 | TFLOPs: 40.10 | +63: iteration 3570/ 5494 | consumed samples: 3655680 | consumed tokens: 7486832640 | elapsed time per iteration (s): 5.80 | learning rate: 7.009E-05 | global batch size: 1024 | lm loss: 2.228720E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.525 | TFLOPs: 39.47 | +63: iteration 3580/ 5494 | consumed samples: 3665920 | consumed tokens: 7507804160 | elapsed time per iteration (s): 5.52 | learning rate: 6.962E-05 | global batch size: 1024 | lm loss: 2.199792E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.345 | TFLOPs: 41.44 | +63: iteration 3590/ 5494 | consumed samples: 3676160 | consumed tokens: 7528775680 | elapsed time per iteration (s): 5.52 | learning rate: 6.916E-05 | global batch size: 1024 | lm loss: 2.233200E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.366 | TFLOPs: 41.44 | +63: iteration 3600/ 5494 | consumed samples: 3686400 | consumed tokens: 7549747200 | elapsed time per iteration (s): 5.61 | learning rate: 6.870E-05 | global batch size: 1024 | lm loss: 2.220530E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.464 | TFLOPs: 40.79 | +63: iteration 3610/ 5494 | consumed samples: 3696640 | consumed tokens: 7570718720 | elapsed time per iteration (s): 5.61 | learning rate: 6.824E-05 | global batch size: 1024 | lm loss: 2.204096E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.388 | TFLOPs: 40.78 | +63: iteration 3620/ 5494 | consumed samples: 3706880 | consumed tokens: 7591690240 | elapsed time per iteration (s): 5.73 | learning rate: 6.778E-05 | global batch size: 1024 | lm loss: 2.220774E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.823 | TFLOPs: 39.98 | +63: iteration 3630/ 5494 | consumed samples: 3717120 | consumed tokens: 7612661760 | elapsed time per iteration (s): 5.65 | learning rate: 6.732E-05 | global batch size: 1024 | lm loss: 2.231177E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.161 | TFLOPs: 40.50 | +63: iteration 3640/ 5494 | consumed samples: 3727360 | consumed tokens: 7633633280 | elapsed time per iteration (s): 5.62 | learning rate: 6.686E-05 | global batch size: 1024 | lm loss: 2.211363E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.200 | TFLOPs: 40.73 | +63: iteration 3650/ 5494 | consumed samples: 3737600 | consumed tokens: 7654604800 | elapsed time per iteration (s): 5.77 | learning rate: 6.641E-05 | global batch size: 1024 | lm loss: 2.217170E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.531 | TFLOPs: 39.69 | +63: iteration 3660/ 5494 | consumed samples: 3747840 | consumed tokens: 7675576320 | elapsed time per iteration (s): 5.53 | learning rate: 6.595E-05 | global batch size: 1024 | lm loss: 2.211649E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.302 | TFLOPs: 41.43 | +63: iteration 3670/ 5494 | consumed samples: 3758080 | consumed tokens: 7696547840 | elapsed time per iteration (s): 5.71 | learning rate: 6.550E-05 | global batch size: 1024 | lm loss: 2.213702E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.336 | TFLOPs: 40.09 | +63: iteration 3680/ 5494 | consumed samples: 3768320 | consumed tokens: 7717519360 | elapsed time per iteration (s): 5.63 | learning rate: 6.505E-05 | global batch size: 1024 | lm loss: 2.214808E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.764 | TFLOPs: 40.64 | +63: iteration 3690/ 5494 | consumed samples: 3778560 | consumed tokens: 7738490880 | elapsed time per iteration (s): 5.53 | learning rate: 6.460E-05 | global batch size: 1024 | lm loss: 2.214012E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.030 | TFLOPs: 41.37 | +63: iteration 3700/ 5494 | consumed samples: 3788800 | consumed tokens: 7759462400 | elapsed time per iteration (s): 6.13 | learning rate: 6.415E-05 | global batch size: 1024 | lm loss: 2.218436E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 167.174 | TFLOPs: 37.37 | +63: iteration 3710/ 5494 | consumed samples: 3799040 | consumed tokens: 7780433920 | elapsed time per iteration (s): 5.50 | learning rate: 6.370E-05 | global batch size: 1024 | lm loss: 2.213525E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.021 | TFLOPs: 41.59 | +63: iteration 3720/ 5494 | consumed samples: 3809280 | consumed tokens: 7801405440 | elapsed time per iteration (s): 5.64 | learning rate: 6.326E-05 | global batch size: 1024 | lm loss: 2.206483E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.560 | TFLOPs: 40.59 | +63: iteration 3730/ 5494 | consumed samples: 3819520 | consumed tokens: 7822376960 | elapsed time per iteration (s): 5.53 | learning rate: 6.282E-05 | global batch size: 1024 | lm loss: 2.197027E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.302 | TFLOPs: 41.43 | +63: iteration 3740/ 5494 | consumed samples: 3829760 | consumed tokens: 7843348480 | elapsed time per iteration (s): 5.74 | learning rate: 6.237E-05 | global batch size: 1024 | lm loss: 2.189925E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.390 | TFLOPs: 39.88 | +63: iteration 3750/ 5494 | consumed samples: 3840000 | consumed tokens: 7864320000 | elapsed time per iteration (s): 5.39 | learning rate: 6.193E-05 | global batch size: 1024 | lm loss: 2.214748E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.863 | TFLOPs: 42.45 | +63: iteration 3760/ 5494 | consumed samples: 3850240 | consumed tokens: 7885291520 | elapsed time per iteration (s): 5.62 | learning rate: 6.149E-05 | global batch size: 1024 | lm loss: 2.218872E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.245 | TFLOPs: 40.74 | +63: iteration 3770/ 5494 | consumed samples: 3860480 | consumed tokens: 7906263040 | elapsed time per iteration (s): 5.73 | learning rate: 6.106E-05 | global batch size: 1024 | lm loss: 2.192256E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.843 | TFLOPs: 39.98 | +63: iteration 3780/ 5494 | consumed samples: 3870720 | consumed tokens: 7927234560 | elapsed time per iteration (s): 5.62 | learning rate: 6.062E-05 | global batch size: 1024 | lm loss: 2.200215E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.139 | TFLOPs: 40.72 | +63: iteration 3790/ 5494 | consumed samples: 3880960 | consumed tokens: 7948206080 | elapsed time per iteration (s): 5.93 | learning rate: 6.019E-05 | global batch size: 1024 | lm loss: 2.199339E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.634 | TFLOPs: 38.60 | +63: iteration 3800/ 5494 | consumed samples: 3891200 | consumed tokens: 7969177600 | elapsed time per iteration (s): 5.78 | learning rate: 5.976E-05 | global batch size: 1024 | lm loss: 2.208147E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.152 | TFLOPs: 39.61 | +63: iteration 3810/ 5494 | consumed samples: 3901440 | consumed tokens: 7990149120 | elapsed time per iteration (s): 5.50 | learning rate: 5.933E-05 | global batch size: 1024 | lm loss: 2.208112E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.095 | TFLOPs: 41.60 | +63: iteration 3820/ 5494 | consumed samples: 3911680 | consumed tokens: 8011120640 | elapsed time per iteration (s): 5.77 | learning rate: 5.890E-05 | global batch size: 1024 | lm loss: 2.205000E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.556 | TFLOPs: 39.70 | +63: iteration 3830/ 5494 | consumed samples: 3921920 | consumed tokens: 8032092160 | elapsed time per iteration (s): 5.54 | learning rate: 5.847E-05 | global batch size: 1024 | lm loss: 2.197479E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.737 | TFLOPs: 41.30 | +63: iteration 3840/ 5494 | consumed samples: 3932160 | consumed tokens: 8053063680 | elapsed time per iteration (s): 5.38 | learning rate: 5.804E-05 | global batch size: 1024 | lm loss: 2.201151E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.206 | TFLOPs: 42.52 | +63: iteration 3850/ 5494 | consumed samples: 3942400 | consumed tokens: 8074035200 | elapsed time per iteration (s): 5.74 | learning rate: 5.762E-05 | global batch size: 1024 | lm loss: 2.199107E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.527 | TFLOPs: 39.91 | +63: iteration 3860/ 5494 | consumed samples: 3952640 | consumed tokens: 8095006720 | elapsed time per iteration (s): 5.62 | learning rate: 5.720E-05 | global batch size: 1024 | lm loss: 2.210650E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.234 | TFLOPs: 40.74 | +63: iteration 3870/ 5494 | consumed samples: 3962880 | consumed tokens: 8115978240 | elapsed time per iteration (s): 5.55 | learning rate: 5.678E-05 | global batch size: 1024 | lm loss: 2.207199E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.603 | TFLOPs: 41.27 | +63: iteration 3880/ 5494 | consumed samples: 3973120 | consumed tokens: 8136949760 | elapsed time per iteration (s): 5.61 | learning rate: 5.636E-05 | global batch size: 1024 | lm loss: 2.202895E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.448 | TFLOPs: 40.79 | +63: iteration 3890/ 5494 | consumed samples: 3983360 | consumed tokens: 8157921280 | elapsed time per iteration (s): 5.78 | learning rate: 5.594E-05 | global batch size: 1024 | lm loss: 2.203499E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.296 | TFLOPs: 39.64 | +63: iteration 3900/ 5494 | consumed samples: 3993600 | consumed tokens: 8178892800 | elapsed time per iteration (s): 5.44 | learning rate: 5.553E-05 | global batch size: 1024 | lm loss: 2.203354E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.118 | TFLOPs: 42.06 | +63: iteration 3910/ 5494 | consumed samples: 4003840 | consumed tokens: 8199864320 | elapsed time per iteration (s): 5.52 | learning rate: 5.512E-05 | global batch size: 1024 | lm loss: 2.203023E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.652 | TFLOPs: 41.51 | +63: iteration 3920/ 5494 | consumed samples: 4014080 | consumed tokens: 8220835840 | elapsed time per iteration (s): 5.65 | learning rate: 5.471E-05 | global batch size: 1024 | lm loss: 2.198124E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.367 | TFLOPs: 40.55 | +63: iteration 3930/ 5494 | consumed samples: 4024320 | consumed tokens: 8241807360 | elapsed time per iteration (s): 5.38 | learning rate: 5.430E-05 | global batch size: 1024 | lm loss: 2.214252E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.294 | TFLOPs: 42.54 | +63: iteration 3940/ 5494 | consumed samples: 4034560 | consumed tokens: 8262778880 | elapsed time per iteration (s): 5.60 | learning rate: 5.389E-05 | global batch size: 1024 | lm loss: 2.204668E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.004 | TFLOPs: 40.91 | +63: iteration 3950/ 5494 | consumed samples: 4044800 | consumed tokens: 8283750400 | elapsed time per iteration (s): 6.05 | learning rate: 5.348E-05 | global batch size: 1024 | lm loss: 2.186449E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 169.135 | TFLOPs: 37.81 | +63: iteration 3960/ 5494 | consumed samples: 4055040 | consumed tokens: 8304721920 | elapsed time per iteration (s): 5.50 | learning rate: 5.308E-05 | global batch size: 1024 | lm loss: 2.197269E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.035 | TFLOPs: 41.59 | +63: iteration 3970/ 5494 | consumed samples: 4065280 | consumed tokens: 8325693440 | elapsed time per iteration (s): 5.74 | learning rate: 5.268E-05 | global batch size: 1024 | lm loss: 2.192209E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.475 | TFLOPs: 39.90 | +63: iteration 3980/ 5494 | consumed samples: 4075520 | consumed tokens: 8346664960 | elapsed time per iteration (s): 5.75 | learning rate: 5.228E-05 | global batch size: 1024 | lm loss: 2.192127E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.961 | TFLOPs: 39.79 | +63: iteration 3990/ 5494 | consumed samples: 4085760 | consumed tokens: 8367636480 | elapsed time per iteration (s): 5.39 | learning rate: 5.188E-05 | global batch size: 1024 | lm loss: 2.191568E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.116 | TFLOPs: 42.50 | + 0: [2022-12-02 00:51:32,225] [INFO] [logging.py:68:log_dist] [Rank 0] step=4000, skipped=0, lr=[5.148437789292599e-05, 5.148437789292599e-05, 5.148437789292599e-05], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +63: iteration 4000/ 5494 | consumed samples: 4096000 | consumed tokens: 8388608000 | elapsed time per iteration (s): 5.38 | learning rate: 5.148E-05 | global batch size: 1024 | lm loss: 2.204333E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.308 | TFLOPs: 42.55 | + 0: steps: 4000 loss: 2.2125 iter time (s): 5.666 samples/sec: 180.732 +63: ------------------------------------------------------------------------------------------ +63: valid loss at iteration 4000 | lm loss value: 2.152087E+00 | lm loss PPL: 8.602798E+00 | +63: ------------------------------------------------------------------------------------------ + 0: saving checkpoint at iteration 4000 to checkpoints_8b7beta + 0: [2022-12-02 00:51:34,299] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step4000 is begin to save! + 0: [2022-12-02 00:51:34,434] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_01-model_01-model_states.pt... + 0: [2022-12-02 00:51:34,434] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_01-model_00-model_states.pt... +32: [2022-12-02 00:51:34,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_24-model_01-model_states.pt... +32: [2022-12-02 00:51:34,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_24-model_00-model_states.pt... + 0: [2022-12-02 00:51:35,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_01-model_00-model_states.pt. + 0: [2022-12-02 00:51:35,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_01-model_01-model_states.pt. +32: [2022-12-02 00:51:35,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_24-model_00-model_states.pt. +32: [2022-12-02 00:51:35,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_24-model_01-model_states.pt. + 0: [2022-12-02 00:51:35,038] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_03-model_01-model_states.pt... + 0: [2022-12-02 00:51:35,038] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_03-model_00-model_states.pt... +32: [2022-12-02 00:51:35,038] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_25-model_01-model_states.pt... +32: [2022-12-02 00:51:35,038] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_25-model_00-model_states.pt... + 0: [2022-12-02 00:51:35,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_03-model_01-model_states.pt. + 0: [2022-12-02 00:51:35,271] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_04-model_01-model_states.pt... + 0: [2022-12-02 00:51:35,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_03-model_00-model_states.pt. + 0: [2022-12-02 00:51:35,286] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_04-model_00-model_states.pt... +32: [2022-12-02 00:51:35,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_25-model_00-model_states.pt. +32: [2022-12-02 00:51:35,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_25-model_01-model_states.pt. +32: [2022-12-02 00:51:35,290] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_26-model_00-model_states.pt... +32: [2022-12-02 00:51:35,290] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_26-model_01-model_states.pt... + 0: [2022-12-02 00:51:35,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_04-model_01-model_states.pt. + 0: [2022-12-02 00:51:35,525] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_05-model_01-model_states.pt... + 0: [2022-12-02 00:51:35,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_04-model_00-model_states.pt. + 0: [2022-12-02 00:51:35,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_05-model_00-model_states.pt... +32: [2022-12-02 00:51:35,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_26-model_01-model_states.pt. +32: [2022-12-02 00:51:35,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_26-model_00-model_states.pt. +32: [2022-12-02 00:51:35,565] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_27-model_01-model_states.pt... +32: [2022-12-02 00:51:35,566] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_27-model_00-model_states.pt... + 0: [2022-12-02 00:51:35,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_05-model_01-model_states.pt. + 0: [2022-12-02 00:51:35,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_06-model_01-model_states.pt... + 0: [2022-12-02 00:51:35,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_05-model_00-model_states.pt. + 0: [2022-12-02 00:51:35,795] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_06-model_00-model_states.pt... +32: [2022-12-02 00:51:35,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_27-model_01-model_states.pt. +32: [2022-12-02 00:51:35,822] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_28-model_01-model_states.pt... +32: [2022-12-02 00:51:35,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_27-model_00-model_states.pt. +32: [2022-12-02 00:51:35,834] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_28-model_00-model_states.pt... + 0: [2022-12-02 00:51:35,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_06-model_01-model_states.pt. + 0: [2022-12-02 00:51:35,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_07-model_01-model_states.pt... + 0: [2022-12-02 00:51:36,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_06-model_00-model_states.pt. + 0: [2022-12-02 00:51:36,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_07-model_00-model_states.pt... +32: [2022-12-02 00:51:36,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_28-model_00-model_states.pt. +32: [2022-12-02 00:51:36,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_29-model_00-model_states.pt... +32: [2022-12-02 00:51:36,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_28-model_01-model_states.pt. +32: [2022-12-02 00:51:36,102] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_29-model_01-model_states.pt... + 0: [2022-12-02 00:51:36,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_07-model_01-model_states.pt. + 0: [2022-12-02 00:51:36,216] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_08-model_01-model_states.pt... + 0: [2022-12-02 00:51:36,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_07-model_00-model_states.pt. + 0: [2022-12-02 00:51:36,276] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_08-model_00-model_states.pt... +32: [2022-12-02 00:51:36,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_29-model_00-model_states.pt. +32: [2022-12-02 00:51:36,342] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_30-model_00-model_states.pt... +32: [2022-12-02 00:51:36,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_29-model_01-model_states.pt. +32: [2022-12-02 00:51:36,344] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_30-model_01-model_states.pt... + 0: [2022-12-02 00:51:36,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_08-model_01-model_states.pt. + 0: [2022-12-02 00:51:36,444] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_09-model_01-model_states.pt... + 0: [2022-12-02 00:51:36,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_08-model_00-model_states.pt. + 0: [2022-12-02 00:51:36,546] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_09-model_00-model_states.pt... +32: [2022-12-02 00:51:36,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_30-model_01-model_states.pt. +32: [2022-12-02 00:51:36,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_30-model_00-model_states.pt. +32: [2022-12-02 00:51:36,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_31-model_01-model_states.pt... +32: [2022-12-02 00:51:36,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_31-model_00-model_states.pt... + 0: [2022-12-02 00:51:36,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_09-model_01-model_states.pt. + 0: [2022-12-02 00:51:36,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_10-model_01-model_states.pt... + 0: [2022-12-02 00:51:36,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_09-model_00-model_states.pt. + 0: [2022-12-02 00:51:36,810] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_10-model_00-model_states.pt... +32: [2022-12-02 00:51:36,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_31-model_00-model_states.pt. +32: [2022-12-02 00:51:36,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_32-model_00-model_states.pt... +32: [2022-12-02 00:51:36,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_31-model_01-model_states.pt. +32: [2022-12-02 00:51:36,896] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_32-model_01-model_states.pt... + 0: [2022-12-02 00:51:36,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_10-model_01-model_states.pt. + 0: [2022-12-02 00:51:36,918] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_11-model_01-model_states.pt... + 0: [2022-12-02 00:51:37,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_11-model_01-model_states.pt. + 0: [2022-12-02 00:51:37,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_12-model_01-model_states.pt... +32: [2022-12-02 00:51:37,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_32-model_01-model_states.pt. +32: [2022-12-02 00:51:37,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_32-model_00-model_states.pt. +32: [2022-12-02 00:51:37,156] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_33-model_01-model_states.pt... +32: [2022-12-02 00:51:37,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_33-model_00-model_states.pt... + 0: [2022-12-02 00:51:37,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_10-model_00-model_states.pt. + 0: [2022-12-02 00:51:37,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_11-model_00-model_states.pt... + 0: [2022-12-02 00:51:37,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_12-model_01-model_states.pt. + 0: [2022-12-02 00:51:37,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_13-model_01-model_states.pt... +32: [2022-12-02 00:51:37,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_33-model_01-model_states.pt. +32: [2022-12-02 00:51:37,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_34-model_01-model_states.pt... +32: [2022-12-02 00:51:37,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_33-model_00-model_states.pt. +32: [2022-12-02 00:51:37,447] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_34-model_00-model_states.pt... + 0: [2022-12-02 00:51:37,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_13-model_01-model_states.pt. + 0: [2022-12-02 00:51:37,586] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_14-model_01-model_states.pt... +32: [2022-12-02 00:51:37,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_34-model_01-model_states.pt. +32: [2022-12-02 00:51:37,670] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_35-model_01-model_states.pt... +32: [2022-12-02 00:51:37,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_34-model_00-model_states.pt. +32: [2022-12-02 00:51:37,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_35-model_00-model_states.pt... + 0: [2022-12-02 00:51:37,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_11-model_00-model_states.pt. + 0: [2022-12-02 00:51:37,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_12-model_00-model_states.pt... + 0: [2022-12-02 00:51:37,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_14-model_01-model_states.pt. + 0: [2022-12-02 00:51:37,811] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_15-model_01-model_states.pt... +32: [2022-12-02 00:51:37,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_35-model_00-model_states.pt. +32: [2022-12-02 00:51:37,930] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_36-model_00-model_states.pt... +32: [2022-12-02 00:51:37,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_35-model_01-model_states.pt. +32: [2022-12-02 00:51:37,938] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_36-model_01-model_states.pt... + 0: [2022-12-02 00:51:38,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_15-model_01-model_states.pt. + 0: [2022-12-02 00:51:38,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_16-model_01-model_states.pt... + 0: [2022-12-02 00:51:38,098] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_12-model_00-model_states.pt. + 0: [2022-12-02 00:51:38,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_13-model_00-model_states.pt... +32: [2022-12-02 00:51:38,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_36-model_01-model_states.pt. +32: [2022-12-02 00:51:38,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_37-model_01-model_states.pt... +32: [2022-12-02 00:51:38,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_36-model_00-model_states.pt. +32: [2022-12-02 00:51:38,210] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_37-model_00-model_states.pt... + 0: [2022-12-02 00:51:38,260] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_16-model_01-model_states.pt. + 0: [2022-12-02 00:51:38,260] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_17-model_01-model_states.pt... + 0: [2022-12-02 00:51:38,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_13-model_00-model_states.pt. + 0: [2022-12-02 00:51:38,349] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_14-model_00-model_states.pt... +32: [2022-12-02 00:51:38,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_37-model_01-model_states.pt. +32: [2022-12-02 00:51:38,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_37-model_00-model_states.pt. +32: [2022-12-02 00:51:38,463] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_38-model_01-model_states.pt... +32: [2022-12-02 00:51:38,463] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_38-model_00-model_states.pt... + 0: [2022-12-02 00:51:38,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_17-model_01-model_states.pt. + 0: [2022-12-02 00:51:38,527] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_18-model_01-model_states.pt... + 0: [2022-12-02 00:51:38,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_14-model_00-model_states.pt. + 0: [2022-12-02 00:51:38,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_15-model_00-model_states.pt... +32: [2022-12-02 00:51:38,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_38-model_00-model_states.pt. +32: [2022-12-02 00:51:38,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_39-model_00-model_states.pt... +32: [2022-12-02 00:51:38,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_38-model_01-model_states.pt. +32: [2022-12-02 00:51:38,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_39-model_01-model_states.pt... + 0: [2022-12-02 00:51:38,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_18-model_01-model_states.pt. + 0: [2022-12-02 00:51:38,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_19-model_01-model_states.pt... + 0: [2022-12-02 00:51:38,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_15-model_00-model_states.pt. + 0: [2022-12-02 00:51:38,811] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_16-model_00-model_states.pt... +32: [2022-12-02 00:51:38,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_39-model_01-model_states.pt. +32: [2022-12-02 00:51:38,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_39-model_00-model_states.pt. +32: [2022-12-02 00:51:38,993] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_40-model_00-model_states.pt... +32: [2022-12-02 00:51:38,993] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_40-model_01-model_states.pt... + 0: [2022-12-02 00:51:38,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_19-model_01-model_states.pt. + 0: [2022-12-02 00:51:38,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_20-model_01-model_states.pt... + 0: [2022-12-02 00:51:39,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_16-model_00-model_states.pt. + 0: [2022-12-02 00:51:39,030] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_17-model_00-model_states.pt... + 0: [2022-12-02 00:51:39,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_20-model_01-model_states.pt. + 0: [2022-12-02 00:51:39,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_21-model_01-model_states.pt... + 0: [2022-12-02 00:51:39,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_17-model_00-model_states.pt. + 0: [2022-12-02 00:51:39,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_18-model_00-model_states.pt... +32: [2022-12-02 00:51:39,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_40-model_01-model_states.pt. +32: [2022-12-02 00:51:39,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_41-model_01-model_states.pt... +32: [2022-12-02 00:51:39,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_40-model_00-model_states.pt. +32: [2022-12-02 00:51:39,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_41-model_00-model_states.pt... + 0: [2022-12-02 00:51:39,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_18-model_00-model_states.pt. + 0: [2022-12-02 00:51:39,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_19-model_00-model_states.pt... + 0: [2022-12-02 00:51:39,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_21-model_01-model_states.pt. + 0: [2022-12-02 00:51:39,480] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_22-model_01-model_states.pt... +32: [2022-12-02 00:51:39,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_41-model_01-model_states.pt. +32: [2022-12-02 00:51:39,503] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_42-model_01-model_states.pt... +32: [2022-12-02 00:51:39,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_41-model_00-model_states.pt. +32: [2022-12-02 00:51:39,512] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_42-model_00-model_states.pt... + 0: [2022-12-02 00:51:39,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_22-model_01-model_states.pt. + 0: [2022-12-02 00:51:39,724] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_23-model_01-model_states.pt... + 0: [2022-12-02 00:51:39,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_19-model_00-model_states.pt. + 0: [2022-12-02 00:51:39,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_20-model_00-model_states.pt... +32: [2022-12-02 00:51:39,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_42-model_00-model_states.pt. +32: [2022-12-02 00:51:39,754] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_43-model_00-model_states.pt... +32: [2022-12-02 00:51:39,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_42-model_01-model_states.pt. +32: [2022-12-02 00:51:39,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_43-model_01-model_states.pt... + 0: [2022-12-02 00:51:39,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_20-model_00-model_states.pt. + 0: [2022-12-02 00:51:39,961] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_21-model_00-model_states.pt... + 0: [2022-12-02 00:51:39,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_23-model_01-model_states.pt. + 0: [2022-12-02 00:51:39,972] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: checkpoints_8b7beta/global_step4000/mp_rank_01_model_states.pt + 0: [2022-12-02 00:51:39,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/mp_rank_01_model_states.pt... + 0: [2022-12-02 00:51:40,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/mp_rank_01_model_states.pt. +32: [2022-12-02 00:51:40,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_43-model_00-model_states.pt. +32: [2022-12-02 00:51:40,023] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_44-model_00-model_states.pt... +32: [2022-12-02 00:51:40,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_43-model_01-model_states.pt. +32: [2022-12-02 00:51:40,033] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_44-model_01-model_states.pt... + 0: [2022-12-02 00:51:40,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_21-model_00-model_states.pt. + 0: [2022-12-02 00:51:40,174] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_22-model_00-model_states.pt... +32: [2022-12-02 00:51:40,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_44-model_00-model_states.pt. +32: [2022-12-02 00:51:40,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_44-model_01-model_states.pt. +32: [2022-12-02 00:51:40,276] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_46-model_00-model_states.pt... +32: [2022-12-02 00:51:40,276] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_46-model_01-model_states.pt... +32: [2022-12-02 00:51:40,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_46-model_00-model_states.pt. +32: [2022-12-02 00:51:40,289] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/mp_rank_02_model_states.pt... +32: [2022-12-02 00:51:40,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_46-model_01-model_states.pt. +32: [2022-12-02 00:51:40,297] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/mp_rank_03_model_states.pt... +32: [2022-12-02 00:51:40,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/mp_rank_02_model_states.pt. +32: [2022-12-02 00:51:40,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/mp_rank_03_model_states.pt. + 0: [2022-12-02 00:51:40,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_22-model_00-model_states.pt. + 0: [2022-12-02 00:51:40,400] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/layer_23-model_00-model_states.pt... + 0: [2022-12-02 00:51:40,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/layer_23-model_00-model_states.pt. + 0: [2022-12-02 00:51:40,619] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_8b7beta/global_step4000/mp_rank_00_model_states.pt + 0: [2022-12-02 00:51:40,619] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/mp_rank_00_model_states.pt... + 0: [2022-12-02 00:51:40,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/mp_rank_00_model_states.pt. +32: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt... +54: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt... +54: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt... +54: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt... +62: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt... +62: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt... +62: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt... +62: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt... +60: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt... +60: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt... +60: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt... +60: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt... +63: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt... +63: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt... +63: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt... +63: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt... +39: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt... +39: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt... +39: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt... +39: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt... +36: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt... +36: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt... +36: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt... +58: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt... +58: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt... +58: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt... +58: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt... +34: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt... +34: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt... +34: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt... +34: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt... +32: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt... +50: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt... +50: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt... +50: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt... +50: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt... +49: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt... +49: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt... +49: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt... +49: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt... +37: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt... +37: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt... +37: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt... +37: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt... +47: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt... +47: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt... +47: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt... +47: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt... +48: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt... +48: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt... +48: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt... +48: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt... +53: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt... +53: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt... +53: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt... +53: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt... +40: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt... +40: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt... +40: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt... +40: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt... + 0: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +54: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt... +54: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt... +56: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt... +56: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt... +56: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt... +56: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt... +62: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt... +62: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt... +62: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt... +60: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt... +60: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt... +60: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt... +60: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt... +51: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt... +51: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt... +51: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt... +51: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt... +63: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt... +63: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt... +63: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt... +39: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt... +39: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt... +39: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt... + 7: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... +57: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt... +57: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt... +57: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt... +27: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... +27: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... +43: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt... +43: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt... +43: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt... +43: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt... +13: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... +38: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt... +38: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt... +38: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt... +36: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt... +36: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt... +42: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt... +42: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt... +42: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt... +42: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt... +58: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt... +44: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt... +44: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt... +44: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt... +44: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt... +34: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt... +34: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt... +34: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt... +32: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt... +50: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt... +50: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt... +46: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt... +46: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt... +46: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt... +46: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt... +49: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt... +49: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt... +49: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt... +49: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt... +41: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt... +41: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt... +41: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt... +55: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt... +55: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt... +55: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt... +55: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt... +37: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt... +21: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... +33: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt... +33: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt... +33: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt... +33: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt... +52: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt... +52: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt... +52: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt... +52: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt... +61: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt... +61: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt... +61: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt... +61: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt... +45: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt... +45: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt... +45: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt... +45: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt... +47: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt... + 9: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... +35: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt... +35: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt... +35: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt... +35: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt... +19: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... +48: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt... +48: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt... +53: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt... +53: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt... +40: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt... +30: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... +30: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... + 0: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... + 0: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... +54: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt... +54: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt... +54: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt... +56: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt... +56: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt... +62: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt... + 4: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +51: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt... +51: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt... +51: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt... +63: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt... +17: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... +17: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... +17: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... + 1: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +39: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt... + 7: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +11: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... +11: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +57: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt... +27: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... +43: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt... +43: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt... +25: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... +13: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +13: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... +13: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +14: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... +38: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt... +38: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt... +22: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... +22: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... +22: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... +36: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt... +36: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt... +36: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt... +24: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... +24: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... +24: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... +24: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... +42: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt... +58: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt... +58: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt... +58: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt... +44: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt... +44: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt... +44: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt... +44: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt... +34: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt... +28: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... +28: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... +28: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... +28: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... +32: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt... +50: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt... +50: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt... +46: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt... +46: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt... +46: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt... +46: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt... +26: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... +26: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... +26: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... +26: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... + 5: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... + 5: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +20: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... +20: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... +20: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... +20: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... +41: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt... +41: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt... +41: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt... +55: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt... +55: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt... +10: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +10: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... +10: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +37: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt... +37: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt... + 8: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... + 8: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +18: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... +18: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... +18: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... +18: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... +18: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... +21: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... +21: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... +21: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... +21: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... +33: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt... +33: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt... +33: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt... +33: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt... +52: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt... +52: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt... +52: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt... + 6: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... + 6: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... + 6: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +31: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... +31: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... +31: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... +31: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... +61: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt... +61: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt... +61: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt... +61: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt... +15: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +15: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +15: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +15: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +45: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt... +45: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt... +47: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt... +47: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt... +47: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt... + 3: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... + 3: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... + 3: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... + 9: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... + 9: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... + 9: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +59: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt... +59: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt... +59: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt... +59: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt... +59: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt... +12: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... +12: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... +12: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... + 2: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... + 2: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... + 2: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... + 2: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +29: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... +29: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... +35: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt... +19: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... +48: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt... +53: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt... +53: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt... +40: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt... +40: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt... +40: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt... +30: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... +23: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... +23: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... +16: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... +16: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... +16: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... + 0: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... + 0: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... +56: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt... +56: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt... + 4: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... + 4: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... + 4: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +51: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt... +17: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... + 1: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... + 7: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... + 7: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... + 7: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... +11: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +11: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +11: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... +57: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt... +57: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt... +57: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt... +27: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... +43: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt... +25: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... +25: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... +25: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... +13: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... +13: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... +14: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +38: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt... +38: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt... +38: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt... +22: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... +24: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... +24: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... +42: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt... +42: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt... +42: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt... +28: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... +32: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt... +26: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... +26: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... + 5: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... + 5: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +20: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... +41: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt... +41: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt... +55: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt... +55: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt... +10: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... +10: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +37: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt... + 8: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... + 8: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +18: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... +21: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... +52: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt... + 6: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... + 6: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... +31: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... +15: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... +45: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt... +45: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt... + 3: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... + 3: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... + 9: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +59: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt... +59: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt... +59: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt... +12: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... +12: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... + 2: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... +29: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... +35: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt... +35: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt... +19: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... +19: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... +48: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt... +30: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... +30: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... +23: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... +23: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... +16: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... +16: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... +16: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... +16: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... + 0: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... + 4: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... + 4: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... +17: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... + 1: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... + 1: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... + 1: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... + 7: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... + 7: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +11: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... +11: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +11: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... +57: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt... +27: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... +43: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt... +25: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... +13: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +14: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +14: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +22: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... +24: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... +28: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... +32: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt... +32: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt... +32: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt... +26: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... + 5: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +20: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... +10: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... + 8: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... + 8: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... +18: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... +21: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... + 6: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... +31: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... +31: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... +15: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... + 3: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... + 9: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... + 9: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... +12: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +12: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... + 2: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... +29: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... +29: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... +35: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt... +19: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... +19: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... +30: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... +23: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... +23: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... +16: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... + 0: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... + 4: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... +17: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... + 1: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... + 1: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... + 7: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... +27: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... +25: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... +13: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +14: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +14: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... +14: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... +22: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... +24: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... +28: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... +28: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... +26: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... + 5: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... +20: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... +20: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... +10: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... + 8: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... +18: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... +21: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... + 6: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... +31: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... +15: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... + 3: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... + 9: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... +12: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... + 2: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... +29: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... +23: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... + 0: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... + 4: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... +17: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... + 1: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +27: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... +25: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... +25: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... +14: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... +22: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... + 5: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... +10: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... + 8: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... + 6: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... +15: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... + 3: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... +19: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... +30: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... +23: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... +17: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... +27: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... +22: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... + 5: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... + 2: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +29: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... +19: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... +30: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... +29: [2022-12-02 00:51:40,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... + 0: [2022-12-02 00:51:41,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. + 0: [2022-12-02 00:51:41,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. + 0: [2022-12-02 00:51:41,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt + 0: [2022-12-02 00:51:41,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt + 0: [2022-12-02 00:51:41,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 0: [2022-12-02 00:51:41,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +32: [2022-12-02 00:51:41,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt. +32: [2022-12-02 00:51:41,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt +32: [2022-12-02 00:51:41,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 0: [2022-12-02 00:51:41,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. + 0: [2022-12-02 00:51:41,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. + 0: [2022-12-02 00:51:41,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt + 0: [2022-12-02 00:51:41,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 0: [2022-12-02 00:51:41,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. + 0: [2022-12-02 00:51:41,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. + 0: [2022-12-02 00:51:41,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt + 0: [2022-12-02 00:51:41,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt + 0: [2022-12-02 00:51:41,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 0: [2022-12-02 00:51:41,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. + 0: [2022-12-02 00:51:41,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. + 0: [2022-12-02 00:51:41,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 0: [2022-12-02 00:51:41,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt + 0: [2022-12-02 00:51:41,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt + 0: [2022-12-02 00:51:41,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 0: [2022-12-02 00:51:41,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +32: [2022-12-02 00:51:41,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt. +32: [2022-12-02 00:51:41,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt +32: [2022-12-02 00:51:41,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +32: [2022-12-02 00:51:41,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt. +32: [2022-12-02 00:51:41,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt +32: [2022-12-02 00:51:41,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +59: [2022-12-02 00:51:41,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt. +32: [2022-12-02 00:51:41,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt. +32: [2022-12-02 00:51:41,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt +32: [2022-12-02 00:51:41,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +32: [2022-12-02 00:51:41,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt. +32: [2022-12-02 00:51:41,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt +32: [2022-12-02 00:51:41,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +32: [2022-12-02 00:51:41,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt. +32: [2022-12-02 00:51:41,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt +32: [2022-12-02 00:51:41,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +32: [2022-12-02 00:51:41,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt. +32: [2022-12-02 00:51:41,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt +32: [2022-12-02 00:51:41,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +59: [2022-12-02 00:51:41,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt +59: [2022-12-02 00:51:41,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +59: [2022-12-02 00:51:41,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt. +59: [2022-12-02 00:51:41,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt +59: [2022-12-02 00:51:41,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +59: [2022-12-02 00:51:41,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt. +59: [2022-12-02 00:51:41,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt +59: [2022-12-02 00:51:41,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +59: [2022-12-02 00:51:41,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt. +59: [2022-12-02 00:51:41,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt +59: [2022-12-02 00:51:41,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +59: [2022-12-02 00:51:41,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt. +59: [2022-12-02 00:51:41,179] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt +59: [2022-12-02 00:51:41,179] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +59: [2022-12-02 00:51:41,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt. +59: [2022-12-02 00:51:41,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt +59: [2022-12-02 00:51:41,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt. +59: [2022-12-02 00:51:41,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +59: [2022-12-02 00:51:41,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt +59: [2022-12-02 00:51:41,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +38: [2022-12-02 00:51:41,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt. +38: [2022-12-02 00:51:41,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt +38: [2022-12-02 00:51:41,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +38: [2022-12-02 00:51:41,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt. +38: [2022-12-02 00:51:41,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt +38: [2022-12-02 00:51:41,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +32: [2022-12-02 00:51:41,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt. +32: [2022-12-02 00:51:41,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt +32: [2022-12-02 00:51:41,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +24: [2022-12-02 00:51:41,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. +24: [2022-12-02 00:51:41,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt +24: [2022-12-02 00:51:41,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +38: [2022-12-02 00:51:41,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt. +38: [2022-12-02 00:51:41,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt +38: [2022-12-02 00:51:41,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +38: [2022-12-02 00:51:41,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt. +38: [2022-12-02 00:51:41,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt +38: [2022-12-02 00:51:41,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +57: [2022-12-02 00:51:41,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt. +57: [2022-12-02 00:51:41,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt +57: [2022-12-02 00:51:41,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +57: [2022-12-02 00:51:41,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt. +57: [2022-12-02 00:51:41,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt +57: [2022-12-02 00:51:41,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +39: [2022-12-02 00:51:41,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt. +16: [2022-12-02 00:51:41,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. +16: [2022-12-02 00:51:41,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. +16: [2022-12-02 00:51:41,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt +16: [2022-12-02 00:51:41,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +16: [2022-12-02 00:51:41,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt +16: [2022-12-02 00:51:41,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +13: [2022-12-02 00:51:41,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +13: [2022-12-02 00:51:41,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +13: [2022-12-02 00:51:41,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +13: [2022-12-02 00:51:41,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +13: [2022-12-02 00:51:41,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +13: [2022-12-02 00:51:41,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +16: [2022-12-02 00:51:41,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. +16: [2022-12-02 00:51:41,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt +16: [2022-12-02 00:51:41,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +16: [2022-12-02 00:51:41,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. +16: [2022-12-02 00:51:41,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt +16: [2022-12-02 00:51:41,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +31: [2022-12-02 00:51:41,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. +31: [2022-12-02 00:51:41,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. +31: [2022-12-02 00:51:41,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. +31: [2022-12-02 00:51:41,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt +31: [2022-12-02 00:51:41,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt +31: [2022-12-02 00:51:41,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +31: [2022-12-02 00:51:41,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt +31: [2022-12-02 00:51:41,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +31: [2022-12-02 00:51:41,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +18: [2022-12-02 00:51:41,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. +18: [2022-12-02 00:51:41,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. +18: [2022-12-02 00:51:41,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. +18: [2022-12-02 00:51:41,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt +18: [2022-12-02 00:51:41,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt +18: [2022-12-02 00:51:41,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt +18: [2022-12-02 00:51:41,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +11: [2022-12-02 00:51:41,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. + 5: [2022-12-02 00:51:41,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. +18: [2022-12-02 00:51:41,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +18: [2022-12-02 00:51:41,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +11: [2022-12-02 00:51:41,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt + 5: [2022-12-02 00:51:41,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +11: [2022-12-02 00:51:41,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 5: [2022-12-02 00:51:41,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt + 5: [2022-12-02 00:51:41,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +11: [2022-12-02 00:51:41,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. + 5: [2022-12-02 00:51:41,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +11: [2022-12-02 00:51:41,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. + 5: [2022-12-02 00:51:41,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +11: [2022-12-02 00:51:41,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt + 5: [2022-12-02 00:51:41,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. +11: [2022-12-02 00:51:41,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt +11: [2022-12-02 00:51:41,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 5: [2022-12-02 00:51:41,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt +31: [2022-12-02 00:51:41,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. +11: [2022-12-02 00:51:41,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 5: [2022-12-02 00:51:41,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +31: [2022-12-02 00:51:41,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt +31: [2022-12-02 00:51:41,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +31: [2022-12-02 00:51:41,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. +31: [2022-12-02 00:51:41,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt +31: [2022-12-02 00:51:41,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +57: [2022-12-02 00:51:41,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt. +57: [2022-12-02 00:51:41,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt +57: [2022-12-02 00:51:41,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +44: [2022-12-02 00:51:41,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt. +29: [2022-12-02 00:51:41,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. +44: [2022-12-02 00:51:41,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt +34: [2022-12-02 00:51:41,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt. +34: [2022-12-02 00:51:41,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt. +34: [2022-12-02 00:51:41,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt. +34: [2022-12-02 00:51:41,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt. +34: [2022-12-02 00:51:41,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt. +44: [2022-12-02 00:51:41,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +34: [2022-12-02 00:51:41,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt +34: [2022-12-02 00:51:41,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt +34: [2022-12-02 00:51:41,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt +34: [2022-12-02 00:51:41,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt +34: [2022-12-02 00:51:41,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt +34: [2022-12-02 00:51:41,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +34: [2022-12-02 00:51:41,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +34: [2022-12-02 00:51:41,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +34: [2022-12-02 00:51:41,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +34: [2022-12-02 00:51:41,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +27: [2022-12-02 00:51:41,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. +27: [2022-12-02 00:51:41,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt +27: [2022-12-02 00:51:41,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +27: [2022-12-02 00:51:41,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. +27: [2022-12-02 00:51:41,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt +27: [2022-12-02 00:51:41,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +27: [2022-12-02 00:51:41,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. +27: [2022-12-02 00:51:41,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt +27: [2022-12-02 00:51:41,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +27: [2022-12-02 00:51:41,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. +27: [2022-12-02 00:51:41,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt +27: [2022-12-02 00:51:41,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +57: [2022-12-02 00:51:41,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt. +57: [2022-12-02 00:51:41,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt +57: [2022-12-02 00:51:41,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 5: [2022-12-02 00:51:41,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. + 5: [2022-12-02 00:51:41,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt + 5: [2022-12-02 00:51:41,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 5: [2022-12-02 00:51:41,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. + 5: [2022-12-02 00:51:41,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt + 5: [2022-12-02 00:51:41,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 3: [2022-12-02 00:51:41,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. + 3: [2022-12-02 00:51:41,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. + 3: [2022-12-02 00:51:41,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. + 3: [2022-12-02 00:51:41,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. + 3: [2022-12-02 00:51:41,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. + 3: [2022-12-02 00:51:41,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt + 3: [2022-12-02 00:51:41,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 3: [2022-12-02 00:51:41,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt + 3: [2022-12-02 00:51:41,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt + 3: [2022-12-02 00:51:41,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt + 3: [2022-12-02 00:51:41,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt + 3: [2022-12-02 00:51:41,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 3: [2022-12-02 00:51:41,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 3: [2022-12-02 00:51:41,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 3: [2022-12-02 00:51:41,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +39: [2022-12-02 00:51:41,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt +22: [2022-12-02 00:51:41,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. +22: [2022-12-02 00:51:41,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. +22: [2022-12-02 00:51:41,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. +44: [2022-12-02 00:51:41,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt. +39: [2022-12-02 00:51:41,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +39: [2022-12-02 00:51:41,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt. +22: [2022-12-02 00:51:41,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt +44: [2022-12-02 00:51:41,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt +39: [2022-12-02 00:51:41,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt +22: [2022-12-02 00:51:41,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt +22: [2022-12-02 00:51:41,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt +44: [2022-12-02 00:51:41,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +39: [2022-12-02 00:51:41,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +22: [2022-12-02 00:51:41,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +22: [2022-12-02 00:51:41,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +44: [2022-12-02 00:51:41,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt. +34: [2022-12-02 00:51:41,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt. +39: [2022-12-02 00:51:41,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt. +22: [2022-12-02 00:51:41,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +44: [2022-12-02 00:51:41,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt +39: [2022-12-02 00:51:41,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt +44: [2022-12-02 00:51:41,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +39: [2022-12-02 00:51:41,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +39: [2022-12-02 00:51:41,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt. +39: [2022-12-02 00:51:41,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt +39: [2022-12-02 00:51:41,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +39: [2022-12-02 00:51:41,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt. +39: [2022-12-02 00:51:41,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt +39: [2022-12-02 00:51:41,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +38: [2022-12-02 00:51:41,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt. +34: [2022-12-02 00:51:41,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt +29: [2022-12-02 00:51:41,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt +34: [2022-12-02 00:51:41,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +29: [2022-12-02 00:51:41,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +29: [2022-12-02 00:51:41,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. +29: [2022-12-02 00:51:41,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt +29: [2022-12-02 00:51:41,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +29: [2022-12-02 00:51:41,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. +29: [2022-12-02 00:51:41,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt +29: [2022-12-02 00:51:41,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +29: [2022-12-02 00:51:41,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. +29: [2022-12-02 00:51:41,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt +29: [2022-12-02 00:51:41,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +34: [2022-12-02 00:51:41,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt. +34: [2022-12-02 00:51:41,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt +34: [2022-12-02 00:51:41,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +34: [2022-12-02 00:51:41,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt. +34: [2022-12-02 00:51:41,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt +34: [2022-12-02 00:51:41,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +38: [2022-12-02 00:51:41,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt +38: [2022-12-02 00:51:41,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt. +38: [2022-12-02 00:51:41,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +38: [2022-12-02 00:51:41,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt +38: [2022-12-02 00:51:41,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +27: [2022-12-02 00:51:41,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. +27: [2022-12-02 00:51:41,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. +27: [2022-12-02 00:51:41,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. +27: [2022-12-02 00:51:41,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt +27: [2022-12-02 00:51:41,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt +27: [2022-12-02 00:51:41,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt +27: [2022-12-02 00:51:41,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +27: [2022-12-02 00:51:41,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +27: [2022-12-02 00:51:41,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +43: [2022-12-02 00:51:41,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt. +43: [2022-12-02 00:51:41,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt. +43: [2022-12-02 00:51:41,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt. +43: [2022-12-02 00:51:41,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt +43: [2022-12-02 00:51:41,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt +43: [2022-12-02 00:51:41,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt +43: [2022-12-02 00:51:41,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +43: [2022-12-02 00:51:41,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +43: [2022-12-02 00:51:41,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +52: [2022-12-02 00:51:41,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt. +52: [2022-12-02 00:51:41,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt. +52: [2022-12-02 00:51:41,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt +52: [2022-12-02 00:51:41,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt +52: [2022-12-02 00:51:41,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +52: [2022-12-02 00:51:41,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +47: [2022-12-02 00:51:41,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt. +47: [2022-12-02 00:51:41,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt. +47: [2022-12-02 00:51:41,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt. +47: [2022-12-02 00:51:41,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt +47: [2022-12-02 00:51:41,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt +47: [2022-12-02 00:51:41,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +47: [2022-12-02 00:51:41,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt +47: [2022-12-02 00:51:41,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +47: [2022-12-02 00:51:41,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +44: [2022-12-02 00:51:41,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt. +28: [2022-12-02 00:51:41,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. + 9: [2022-12-02 00:51:41,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. + 9: [2022-12-02 00:51:41,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt + 9: [2022-12-02 00:51:41,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. +28: [2022-12-02 00:51:41,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt + 9: [2022-12-02 00:51:41,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 9: [2022-12-02 00:51:41,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. + 9: [2022-12-02 00:51:41,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt +28: [2022-12-02 00:51:41,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 9: [2022-12-02 00:51:41,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt +28: [2022-12-02 00:51:41,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. + 9: [2022-12-02 00:51:41,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +28: [2022-12-02 00:51:41,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. + 9: [2022-12-02 00:51:41,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +28: [2022-12-02 00:51:41,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt + 9: [2022-12-02 00:51:41,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. + 9: [2022-12-02 00:51:41,249] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt + 9: [2022-12-02 00:51:41,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +28: [2022-12-02 00:51:41,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +28: [2022-12-02 00:51:41,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt +28: [2022-12-02 00:51:41,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +44: [2022-12-02 00:51:41,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt +44: [2022-12-02 00:51:41,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +44: [2022-12-02 00:51:41,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt. +44: [2022-12-02 00:51:41,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt +44: [2022-12-02 00:51:41,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +57: [2022-12-02 00:51:41,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt. +57: [2022-12-02 00:51:41,249] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt +57: [2022-12-02 00:51:41,250] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +43: [2022-12-02 00:51:41,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt. +43: [2022-12-02 00:51:41,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt. +43: [2022-12-02 00:51:41,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt +43: [2022-12-02 00:51:41,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt +43: [2022-12-02 00:51:41,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +43: [2022-12-02 00:51:41,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +43: [2022-12-02 00:51:41,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt. +43: [2022-12-02 00:51:41,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt +43: [2022-12-02 00:51:41,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +13: [2022-12-02 00:51:41,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. +13: [2022-12-02 00:51:41,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. +13: [2022-12-02 00:51:41,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. +13: [2022-12-02 00:51:41,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt +13: [2022-12-02 00:51:41,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt +13: [2022-12-02 00:51:41,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt +13: [2022-12-02 00:51:41,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +13: [2022-12-02 00:51:41,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +13: [2022-12-02 00:51:41,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +16: [2022-12-02 00:51:41,259] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. +16: [2022-12-02 00:51:41,259] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt +16: [2022-12-02 00:51:41,259] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 9: [2022-12-02 00:51:41,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +52: [2022-12-02 00:51:41,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt. +52: [2022-12-02 00:51:41,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt. +52: [2022-12-02 00:51:41,265] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt +52: [2022-12-02 00:51:41,265] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt +52: [2022-12-02 00:51:41,265] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +52: [2022-12-02 00:51:41,265] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 9: [2022-12-02 00:51:41,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt + 9: [2022-12-02 00:51:41,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +61: [2022-12-02 00:51:41,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt. +61: [2022-12-02 00:51:41,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt. +61: [2022-12-02 00:51:41,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt. +28: [2022-12-02 00:51:41,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. +29: [2022-12-02 00:51:41,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. +28: [2022-12-02 00:51:41,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt +28: [2022-12-02 00:51:41,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +21: [2022-12-02 00:51:41,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. +21: [2022-12-02 00:51:41,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. +21: [2022-12-02 00:51:41,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. +21: [2022-12-02 00:51:41,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. +21: [2022-12-02 00:51:41,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt +21: [2022-12-02 00:51:41,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt +21: [2022-12-02 00:51:41,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt +21: [2022-12-02 00:51:41,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +21: [2022-12-02 00:51:41,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt +21: [2022-12-02 00:51:41,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +21: [2022-12-02 00:51:41,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +21: [2022-12-02 00:51:41,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +57: [2022-12-02 00:51:41,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt. +57: [2022-12-02 00:51:41,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt +57: [2022-12-02 00:51:41,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +29: [2022-12-02 00:51:41,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt +29: [2022-12-02 00:51:41,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +18: [2022-12-02 00:51:41,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. +52: [2022-12-02 00:51:41,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt. +52: [2022-12-02 00:51:41,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt +52: [2022-12-02 00:51:41,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +18: [2022-12-02 00:51:41,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt +18: [2022-12-02 00:51:41,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +47: [2022-12-02 00:51:41,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt. +47: [2022-12-02 00:51:41,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt +47: [2022-12-02 00:51:41,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +19: [2022-12-02 00:51:41,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. +47: [2022-12-02 00:51:41,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt. +19: [2022-12-02 00:51:41,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt +19: [2022-12-02 00:51:41,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +19: [2022-12-02 00:51:41,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. +22: [2022-12-02 00:51:41,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. +19: [2022-12-02 00:51:41,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt +19: [2022-12-02 00:51:41,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +22: [2022-12-02 00:51:41,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt +22: [2022-12-02 00:51:41,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +22: [2022-12-02 00:51:41,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. +22: [2022-12-02 00:51:41,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt +22: [2022-12-02 00:51:41,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +47: [2022-12-02 00:51:41,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt +47: [2022-12-02 00:51:41,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +47: [2022-12-02 00:51:41,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt. +47: [2022-12-02 00:51:41,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt +47: [2022-12-02 00:51:41,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +52: [2022-12-02 00:51:41,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt. +52: [2022-12-02 00:51:41,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt +52: [2022-12-02 00:51:41,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +24: [2022-12-02 00:51:41,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. +24: [2022-12-02 00:51:41,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt +24: [2022-12-02 00:51:41,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +17: [2022-12-02 00:51:41,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. +17: [2022-12-02 00:51:41,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. +17: [2022-12-02 00:51:41,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. + 1: [2022-12-02 00:51:41,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. + 1: [2022-12-02 00:51:41,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. + 1: [2022-12-02 00:51:41,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. + 1: [2022-12-02 00:51:41,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. + 1: [2022-12-02 00:51:41,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt + 1: [2022-12-02 00:51:41,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt + 1: [2022-12-02 00:51:41,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt + 1: [2022-12-02 00:51:41,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt + 1: [2022-12-02 00:51:41,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 1: [2022-12-02 00:51:41,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 1: [2022-12-02 00:51:41,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 1: [2022-12-02 00:51:41,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +23: [2022-12-02 00:51:41,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. +23: [2022-12-02 00:51:41,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. +23: [2022-12-02 00:51:41,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt +23: [2022-12-02 00:51:41,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt +23: [2022-12-02 00:51:41,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +23: [2022-12-02 00:51:41,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +17: [2022-12-02 00:51:41,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. +17: [2022-12-02 00:51:41,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. +17: [2022-12-02 00:51:41,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt +17: [2022-12-02 00:51:41,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt +17: [2022-12-02 00:51:41,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt +17: [2022-12-02 00:51:41,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +17: [2022-12-02 00:51:41,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt +17: [2022-12-02 00:51:41,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt +17: [2022-12-02 00:51:41,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +17: [2022-12-02 00:51:41,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +17: [2022-12-02 00:51:41,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +17: [2022-12-02 00:51:41,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +56: [2022-12-02 00:51:41,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt. +56: [2022-12-02 00:51:41,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt. +56: [2022-12-02 00:51:41,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt. +56: [2022-12-02 00:51:41,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt +56: [2022-12-02 00:51:41,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt +56: [2022-12-02 00:51:41,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +56: [2022-12-02 00:51:41,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt +56: [2022-12-02 00:51:41,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +56: [2022-12-02 00:51:41,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +11: [2022-12-02 00:51:41,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. +11: [2022-12-02 00:51:41,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt +11: [2022-12-02 00:51:41,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +28: [2022-12-02 00:51:41,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. +28: [2022-12-02 00:51:41,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt +28: [2022-12-02 00:51:41,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +25: [2022-12-02 00:51:41,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. +25: [2022-12-02 00:51:41,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. +25: [2022-12-02 00:51:41,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. +25: [2022-12-02 00:51:41,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. +23: [2022-12-02 00:51:41,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. +23: [2022-12-02 00:51:41,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. +23: [2022-12-02 00:51:41,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. +23: [2022-12-02 00:51:41,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt +23: [2022-12-02 00:51:41,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt +23: [2022-12-02 00:51:41,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt +23: [2022-12-02 00:51:41,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +23: [2022-12-02 00:51:41,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +23: [2022-12-02 00:51:41,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +56: [2022-12-02 00:51:41,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt. +25: [2022-12-02 00:51:41,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. +61: [2022-12-02 00:51:41,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt +61: [2022-12-02 00:51:41,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt +25: [2022-12-02 00:51:41,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt +25: [2022-12-02 00:51:41,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt +61: [2022-12-02 00:51:41,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt +25: [2022-12-02 00:51:41,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt +61: [2022-12-02 00:51:41,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +61: [2022-12-02 00:51:41,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +61: [2022-12-02 00:51:41,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +25: [2022-12-02 00:51:41,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +25: [2022-12-02 00:51:41,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt +25: [2022-12-02 00:51:41,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt +25: [2022-12-02 00:51:41,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +25: [2022-12-02 00:51:41,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +25: [2022-12-02 00:51:41,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +25: [2022-12-02 00:51:41,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +56: [2022-12-02 00:51:41,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt +56: [2022-12-02 00:51:41,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +56: [2022-12-02 00:51:41,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt. +56: [2022-12-02 00:51:41,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt. +56: [2022-12-02 00:51:41,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt +56: [2022-12-02 00:51:41,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt. +56: [2022-12-02 00:51:41,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +56: [2022-12-02 00:51:41,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt +56: [2022-12-02 00:51:41,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt +56: [2022-12-02 00:51:41,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +56: [2022-12-02 00:51:41,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +61: [2022-12-02 00:51:41,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt. +61: [2022-12-02 00:51:41,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt +61: [2022-12-02 00:51:41,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt. +61: [2022-12-02 00:51:41,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +61: [2022-12-02 00:51:41,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt +61: [2022-12-02 00:51:41,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt. +61: [2022-12-02 00:51:41,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +61: [2022-12-02 00:51:41,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt +61: [2022-12-02 00:51:41,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +61: [2022-12-02 00:51:41,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt. +61: [2022-12-02 00:51:41,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt +61: [2022-12-02 00:51:41,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +59: [2022-12-02 00:51:41,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt. + 8: [2022-12-02 00:51:41,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. + 8: [2022-12-02 00:51:41,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. + 8: [2022-12-02 00:51:41,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. +10: [2022-12-02 00:51:41,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. +10: [2022-12-02 00:51:41,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +10: [2022-12-02 00:51:41,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. +10: [2022-12-02 00:51:41,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. +10: [2022-12-02 00:51:41,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt +10: [2022-12-02 00:51:41,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +10: [2022-12-02 00:51:41,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt +10: [2022-12-02 00:51:41,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +10: [2022-12-02 00:51:41,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +10: [2022-12-02 00:51:41,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt +10: [2022-12-02 00:51:41,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +10: [2022-12-02 00:51:41,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +38: [2022-12-02 00:51:41,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt. +42: [2022-12-02 00:51:41,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt. +42: [2022-12-02 00:51:41,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt. +42: [2022-12-02 00:51:41,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt. +42: [2022-12-02 00:51:41,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt. +42: [2022-12-02 00:51:41,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt +42: [2022-12-02 00:51:41,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt +42: [2022-12-02 00:51:41,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt +42: [2022-12-02 00:51:41,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt +42: [2022-12-02 00:51:41,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +42: [2022-12-02 00:51:41,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +42: [2022-12-02 00:51:41,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +42: [2022-12-02 00:51:41,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 8: [2022-12-02 00:51:41,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. + 8: [2022-12-02 00:51:41,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt + 8: [2022-12-02 00:51:41,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. + 8: [2022-12-02 00:51:41,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt + 8: [2022-12-02 00:51:41,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt + 8: [2022-12-02 00:51:41,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt + 8: [2022-12-02 00:51:41,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 8: [2022-12-02 00:51:41,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 8: [2022-12-02 00:51:41,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 8: [2022-12-02 00:51:41,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt + 8: [2022-12-02 00:51:41,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 8: [2022-12-02 00:51:41,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +38: [2022-12-02 00:51:41,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt +38: [2022-12-02 00:51:41,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +36: [2022-12-02 00:51:41,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt. +36: [2022-12-02 00:51:41,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt. +36: [2022-12-02 00:51:41,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt. +36: [2022-12-02 00:51:41,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt. +36: [2022-12-02 00:51:41,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt. +36: [2022-12-02 00:51:41,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt. +59: [2022-12-02 00:51:41,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt +59: [2022-12-02 00:51:41,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +19: [2022-12-02 00:51:41,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. +19: [2022-12-02 00:51:41,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. +36: [2022-12-02 00:51:41,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt. +36: [2022-12-02 00:51:41,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt. +36: [2022-12-02 00:51:41,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt +36: [2022-12-02 00:51:41,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt +36: [2022-12-02 00:51:41,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt +36: [2022-12-02 00:51:41,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt +36: [2022-12-02 00:51:41,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt +36: [2022-12-02 00:51:41,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt +36: [2022-12-02 00:51:41,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt +36: [2022-12-02 00:51:41,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt +36: [2022-12-02 00:51:41,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +36: [2022-12-02 00:51:41,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +36: [2022-12-02 00:51:41,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +36: [2022-12-02 00:51:41,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +36: [2022-12-02 00:51:41,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +24: [2022-12-02 00:51:41,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. +36: [2022-12-02 00:51:41,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +36: [2022-12-02 00:51:41,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +36: [2022-12-02 00:51:41,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +19: [2022-12-02 00:51:41,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. +19: [2022-12-02 00:51:41,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt +19: [2022-12-02 00:51:41,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt +19: [2022-12-02 00:51:41,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt +19: [2022-12-02 00:51:41,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +19: [2022-12-02 00:51:41,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +19: [2022-12-02 00:51:41,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +62: [2022-12-02 00:51:41,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt. +24: [2022-12-02 00:51:41,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt +24: [2022-12-02 00:51:41,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +63: [2022-12-02 00:51:41,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt. +63: [2022-12-02 00:51:41,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt. +63: [2022-12-02 00:51:41,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt. +63: [2022-12-02 00:51:41,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt. +63: [2022-12-02 00:51:41,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt. + 6: [2022-12-02 00:51:41,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. + 6: [2022-12-02 00:51:41,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. + 6: [2022-12-02 00:51:41,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. + 6: [2022-12-02 00:51:41,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. + 6: [2022-12-02 00:51:41,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. + 6: [2022-12-02 00:51:41,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt + 6: [2022-12-02 00:51:41,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt + 6: [2022-12-02 00:51:41,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt + 6: [2022-12-02 00:51:41,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt + 6: [2022-12-02 00:51:41,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt + 6: [2022-12-02 00:51:41,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 6: [2022-12-02 00:51:41,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 6: [2022-12-02 00:51:41,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 6: [2022-12-02 00:51:41,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 6: [2022-12-02 00:51:41,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +12: [2022-12-02 00:51:41,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. +63: [2022-12-02 00:51:41,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt. +63: [2022-12-02 00:51:41,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt +63: [2022-12-02 00:51:41,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt +63: [2022-12-02 00:51:41,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt +63: [2022-12-02 00:51:41,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt +63: [2022-12-02 00:51:41,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt +63: [2022-12-02 00:51:41,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt +63: [2022-12-02 00:51:41,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +63: [2022-12-02 00:51:41,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +63: [2022-12-02 00:51:41,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +63: [2022-12-02 00:51:41,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +63: [2022-12-02 00:51:41,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +63: [2022-12-02 00:51:41,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +62: [2022-12-02 00:51:41,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt +62: [2022-12-02 00:51:41,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt. +62: [2022-12-02 00:51:41,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +62: [2022-12-02 00:51:41,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt. +62: [2022-12-02 00:51:41,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt +62: [2022-12-02 00:51:41,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +62: [2022-12-02 00:51:41,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt +62: [2022-12-02 00:51:41,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +12: [2022-12-02 00:51:41,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +12: [2022-12-02 00:51:41,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +12: [2022-12-02 00:51:41,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. +12: [2022-12-02 00:51:41,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. +12: [2022-12-02 00:51:41,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt +12: [2022-12-02 00:51:41,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +12: [2022-12-02 00:51:41,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +12: [2022-12-02 00:51:41,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +12: [2022-12-02 00:51:41,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt +12: [2022-12-02 00:51:41,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt +12: [2022-12-02 00:51:41,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +12: [2022-12-02 00:51:41,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +12: [2022-12-02 00:51:41,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +12: [2022-12-02 00:51:41,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +50: [2022-12-02 00:51:41,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt. +50: [2022-12-02 00:51:41,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt. +50: [2022-12-02 00:51:41,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt. +50: [2022-12-02 00:51:41,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt +50: [2022-12-02 00:51:41,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt +50: [2022-12-02 00:51:41,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt +50: [2022-12-02 00:51:41,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +50: [2022-12-02 00:51:41,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +50: [2022-12-02 00:51:41,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +53: [2022-12-02 00:51:41,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt. +53: [2022-12-02 00:51:41,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt. +53: [2022-12-02 00:51:41,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt. +15: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +15: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. +15: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +15: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. +15: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +15: [2022-12-02 00:51:41,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +15: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +15: [2022-12-02 00:51:41,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +15: [2022-12-02 00:51:41,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt +15: [2022-12-02 00:51:41,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt +15: [2022-12-02 00:51:41,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +15: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +15: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +15: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +15: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +50: [2022-12-02 00:51:41,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt. +50: [2022-12-02 00:51:41,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt +50: [2022-12-02 00:51:41,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +50: [2022-12-02 00:51:41,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt. +50: [2022-12-02 00:51:41,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt +50: [2022-12-02 00:51:41,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +50: [2022-12-02 00:51:41,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt. +50: [2022-12-02 00:51:41,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt. +50: [2022-12-02 00:51:41,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt +50: [2022-12-02 00:51:41,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt +50: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +50: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +19: [2022-12-02 00:51:41,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. +42: [2022-12-02 00:51:41,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt. +14: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. +14: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. +14: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +14: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +14: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. +14: [2022-12-02 00:51:41,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt +14: [2022-12-02 00:51:41,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt +14: [2022-12-02 00:51:41,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +14: [2022-12-02 00:51:41,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +14: [2022-12-02 00:51:41,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt +14: [2022-12-02 00:51:41,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +14: [2022-12-02 00:51:41,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +14: [2022-12-02 00:51:41,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +14: [2022-12-02 00:51:41,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +14: [2022-12-02 00:51:41,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +53: [2022-12-02 00:51:41,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt +53: [2022-12-02 00:51:41,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt +53: [2022-12-02 00:51:41,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt +53: [2022-12-02 00:51:41,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +53: [2022-12-02 00:51:41,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +53: [2022-12-02 00:51:41,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +53: [2022-12-02 00:51:41,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt. +53: [2022-12-02 00:51:41,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt. +53: [2022-12-02 00:51:41,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt +53: [2022-12-02 00:51:41,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt +53: [2022-12-02 00:51:41,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +53: [2022-12-02 00:51:41,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +62: [2022-12-02 00:51:41,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt. +52: [2022-12-02 00:51:41,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt. +52: [2022-12-02 00:51:41,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt +52: [2022-12-02 00:51:41,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +50: [2022-12-02 00:51:41,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt. +50: [2022-12-02 00:51:41,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt +50: [2022-12-02 00:51:41,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +19: [2022-12-02 00:51:41,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. +19: [2022-12-02 00:51:41,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt +19: [2022-12-02 00:51:41,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +19: [2022-12-02 00:51:41,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt +19: [2022-12-02 00:51:41,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +61: [2022-12-02 00:51:41,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt. +24: [2022-12-02 00:51:41,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. +42: [2022-12-02 00:51:41,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt +61: [2022-12-02 00:51:41,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt +42: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +61: [2022-12-02 00:51:41,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +62: [2022-12-02 00:51:41,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt +62: [2022-12-02 00:51:41,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +62: [2022-12-02 00:51:41,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt. +62: [2022-12-02 00:51:41,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt +62: [2022-12-02 00:51:41,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt. +62: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +62: [2022-12-02 00:51:41,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt +62: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +62: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt. +62: [2022-12-02 00:51:41,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt +62: [2022-12-02 00:51:41,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +24: [2022-12-02 00:51:41,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt +24: [2022-12-02 00:51:41,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +30: [2022-12-02 00:51:41,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. +30: [2022-12-02 00:51:41,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. +30: [2022-12-02 00:51:41,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. +30: [2022-12-02 00:51:41,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt +30: [2022-12-02 00:51:41,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt +30: [2022-12-02 00:51:41,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. +30: [2022-12-02 00:51:41,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +30: [2022-12-02 00:51:41,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt +30: [2022-12-02 00:51:41,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +30: [2022-12-02 00:51:41,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +30: [2022-12-02 00:51:41,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt +30: [2022-12-02 00:51:41,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +30: [2022-12-02 00:51:41,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. +30: [2022-12-02 00:51:41,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt +30: [2022-12-02 00:51:41,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 4: [2022-12-02 00:51:41,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. + 4: [2022-12-02 00:51:41,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. + 4: [2022-12-02 00:51:41,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. + 4: [2022-12-02 00:51:41,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. + 4: [2022-12-02 00:51:41,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. + 4: [2022-12-02 00:51:41,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt + 4: [2022-12-02 00:51:41,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt + 4: [2022-12-02 00:51:41,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 4: [2022-12-02 00:51:41,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 4: [2022-12-02 00:51:41,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt + 4: [2022-12-02 00:51:41,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt + 4: [2022-12-02 00:51:41,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt + 4: [2022-12-02 00:51:41,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 4: [2022-12-02 00:51:41,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 4: [2022-12-02 00:51:41,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +47: [2022-12-02 00:51:41,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt. +47: [2022-12-02 00:51:41,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt +47: [2022-12-02 00:51:41,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +37: [2022-12-02 00:51:41,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt. +37: [2022-12-02 00:51:41,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt. +37: [2022-12-02 00:51:41,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt. +37: [2022-12-02 00:51:41,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt. +37: [2022-12-02 00:51:41,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt +37: [2022-12-02 00:51:41,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt. +37: [2022-12-02 00:51:41,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt +37: [2022-12-02 00:51:41,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +37: [2022-12-02 00:51:41,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt +37: [2022-12-02 00:51:41,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt +37: [2022-12-02 00:51:41,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +37: [2022-12-02 00:51:41,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt +37: [2022-12-02 00:51:41,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +37: [2022-12-02 00:51:41,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +37: [2022-12-02 00:51:41,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 7: [2022-12-02 00:51:41,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. + 7: [2022-12-02 00:51:41,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. + 7: [2022-12-02 00:51:41,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt + 7: [2022-12-02 00:51:41,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt + 7: [2022-12-02 00:51:41,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 7: [2022-12-02 00:51:41,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. + 7: [2022-12-02 00:51:41,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 7: [2022-12-02 00:51:41,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt + 7: [2022-12-02 00:51:41,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 7: [2022-12-02 00:51:41,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. + 7: [2022-12-02 00:51:41,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt + 7: [2022-12-02 00:51:41,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. + 7: [2022-12-02 00:51:41,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 7: [2022-12-02 00:51:41,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt + 7: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 7: [2022-12-02 00:51:41,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. + 7: [2022-12-02 00:51:41,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt + 7: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +53: [2022-12-02 00:51:41,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt. +53: [2022-12-02 00:51:41,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt +53: [2022-12-02 00:51:41,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +10: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. +10: [2022-12-02 00:51:41,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt +10: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +54: [2022-12-02 00:51:41,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt. +54: [2022-12-02 00:51:41,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt +54: [2022-12-02 00:51:41,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +48: [2022-12-02 00:51:41,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt. +48: [2022-12-02 00:51:41,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt. +54: [2022-12-02 00:51:41,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt. +54: [2022-12-02 00:51:41,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt +54: [2022-12-02 00:51:41,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +48: [2022-12-02 00:51:41,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt. +48: [2022-12-02 00:51:41,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt. +48: [2022-12-02 00:51:41,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt. +48: [2022-12-02 00:51:41,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt. +48: [2022-12-02 00:51:41,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt +48: [2022-12-02 00:51:41,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt +48: [2022-12-02 00:51:41,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt +48: [2022-12-02 00:51:41,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt +48: [2022-12-02 00:51:41,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt +48: [2022-12-02 00:51:41,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +48: [2022-12-02 00:51:41,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +48: [2022-12-02 00:51:41,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt +48: [2022-12-02 00:51:41,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +48: [2022-12-02 00:51:41,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +48: [2022-12-02 00:51:41,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +48: [2022-12-02 00:51:41,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +54: [2022-12-02 00:51:41,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt. +54: [2022-12-02 00:51:41,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt +54: [2022-12-02 00:51:41,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +54: [2022-12-02 00:51:41,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt. +54: [2022-12-02 00:51:41,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt +54: [2022-12-02 00:51:41,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +44: [2022-12-02 00:51:41,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt. +44: [2022-12-02 00:51:41,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt +44: [2022-12-02 00:51:41,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +49: [2022-12-02 00:51:41,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt. +49: [2022-12-02 00:51:41,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt. +49: [2022-12-02 00:51:41,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt. +49: [2022-12-02 00:51:41,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt. +49: [2022-12-02 00:51:41,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt. +49: [2022-12-02 00:51:41,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt. +49: [2022-12-02 00:51:41,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt. +49: [2022-12-02 00:51:41,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt +49: [2022-12-02 00:51:41,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt +49: [2022-12-02 00:51:41,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt +49: [2022-12-02 00:51:41,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt +49: [2022-12-02 00:51:41,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt +49: [2022-12-02 00:51:41,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +49: [2022-12-02 00:51:41,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt +49: [2022-12-02 00:51:41,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt +49: [2022-12-02 00:51:41,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +49: [2022-12-02 00:51:41,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +49: [2022-12-02 00:51:41,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +49: [2022-12-02 00:51:41,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +49: [2022-12-02 00:51:41,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +49: [2022-12-02 00:51:41,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +41: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt. +41: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt. +41: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt. +41: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt. +41: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt. +41: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt. +41: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt. +41: [2022-12-02 00:51:41,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt +41: [2022-12-02 00:51:41,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt +41: [2022-12-02 00:51:41,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt +41: [2022-12-02 00:51:41,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt +41: [2022-12-02 00:51:41,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt +41: [2022-12-02 00:51:41,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt +41: [2022-12-02 00:51:41,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt +41: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +41: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +41: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +41: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +41: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +41: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +41: [2022-12-02 00:51:41,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +33: [2022-12-02 00:51:41,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt. +33: [2022-12-02 00:51:41,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt. +33: [2022-12-02 00:51:41,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt. +33: [2022-12-02 00:51:41,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt. +33: [2022-12-02 00:51:41,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt. +33: [2022-12-02 00:51:41,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt +33: [2022-12-02 00:51:41,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt +33: [2022-12-02 00:51:41,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt +33: [2022-12-02 00:51:41,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt +33: [2022-12-02 00:51:41,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt +33: [2022-12-02 00:51:41,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +33: [2022-12-02 00:51:41,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +33: [2022-12-02 00:51:41,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +33: [2022-12-02 00:51:41,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +33: [2022-12-02 00:51:41,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +33: [2022-12-02 00:51:41,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt. +33: [2022-12-02 00:51:41,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt +33: [2022-12-02 00:51:41,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +54: [2022-12-02 00:51:41,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt. +54: [2022-12-02 00:51:41,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt. +54: [2022-12-02 00:51:41,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt. +54: [2022-12-02 00:51:41,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt +54: [2022-12-02 00:51:41,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt +54: [2022-12-02 00:51:41,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt +54: [2022-12-02 00:51:41,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +54: [2022-12-02 00:51:41,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +54: [2022-12-02 00:51:41,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +42: [2022-12-02 00:51:41,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt. +42: [2022-12-02 00:51:41,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt +42: [2022-12-02 00:51:41,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 2: [2022-12-02 00:51:41,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. + 2: [2022-12-02 00:51:41,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. + 2: [2022-12-02 00:51:41,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. + 2: [2022-12-02 00:51:41,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. + 2: [2022-12-02 00:51:41,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. + 2: [2022-12-02 00:51:41,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. + 2: [2022-12-02 00:51:41,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt + 2: [2022-12-02 00:51:41,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt + 2: [2022-12-02 00:51:41,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt + 2: [2022-12-02 00:51:41,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt + 2: [2022-12-02 00:51:41,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt + 2: [2022-12-02 00:51:41,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 2: [2022-12-02 00:51:41,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt + 2: [2022-12-02 00:51:41,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 2: [2022-12-02 00:51:41,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 2: [2022-12-02 00:51:41,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 2: [2022-12-02 00:51:41,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 2: [2022-12-02 00:51:41,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +24: [2022-12-02 00:51:41,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. +24: [2022-12-02 00:51:41,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt +24: [2022-12-02 00:51:41,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +43: [2022-12-02 00:51:41,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt. +43: [2022-12-02 00:51:41,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt +43: [2022-12-02 00:51:41,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +45: [2022-12-02 00:51:41,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt. +45: [2022-12-02 00:51:41,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt. +45: [2022-12-02 00:51:41,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt. +45: [2022-12-02 00:51:41,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt +45: [2022-12-02 00:51:41,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt. +45: [2022-12-02 00:51:41,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt. +45: [2022-12-02 00:51:41,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt +45: [2022-12-02 00:51:41,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt +45: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +45: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +45: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +45: [2022-12-02 00:51:41,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt +45: [2022-12-02 00:51:41,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt +45: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +45: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +51: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt. +51: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt. +51: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt. +51: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt. +51: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt. +51: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt. +45: [2022-12-02 00:51:41,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt. +51: [2022-12-02 00:51:41,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt +51: [2022-12-02 00:51:41,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt +51: [2022-12-02 00:51:41,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt +51: [2022-12-02 00:51:41,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt +51: [2022-12-02 00:51:41,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt +45: [2022-12-02 00:51:41,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt +51: [2022-12-02 00:51:41,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt +45: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +51: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +51: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +51: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +51: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +51: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +51: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +51: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt. +51: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt. +51: [2022-12-02 00:51:41,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt +51: [2022-12-02 00:51:41,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt +51: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +51: [2022-12-02 00:51:41,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +55: [2022-12-02 00:51:41,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt. +55: [2022-12-02 00:51:41,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt. +55: [2022-12-02 00:51:41,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt. +55: [2022-12-02 00:51:41,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt. +55: [2022-12-02 00:51:41,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt. +55: [2022-12-02 00:51:41,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt +55: [2022-12-02 00:51:41,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt +55: [2022-12-02 00:51:41,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt +55: [2022-12-02 00:51:41,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt +55: [2022-12-02 00:51:41,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt +55: [2022-12-02 00:51:41,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +55: [2022-12-02 00:51:41,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +55: [2022-12-02 00:51:41,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +55: [2022-12-02 00:51:41,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +55: [2022-12-02 00:51:41,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +55: [2022-12-02 00:51:41,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt. +55: [2022-12-02 00:51:41,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt +55: [2022-12-02 00:51:41,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +55: [2022-12-02 00:51:41,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt. +55: [2022-12-02 00:51:41,390] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt +55: [2022-12-02 00:51:41,390] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +55: [2022-12-02 00:51:41,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt. +55: [2022-12-02 00:51:41,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt +55: [2022-12-02 00:51:41,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +28: [2022-12-02 00:51:41,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. +28: [2022-12-02 00:51:41,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt +28: [2022-12-02 00:51:41,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +11: [2022-12-02 00:51:41,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +11: [2022-12-02 00:51:41,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +11: [2022-12-02 00:51:41,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +18: [2022-12-02 00:51:41,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. +18: [2022-12-02 00:51:41,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt +18: [2022-12-02 00:51:41,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +60: [2022-12-02 00:51:41,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt. +60: [2022-12-02 00:51:41,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt. +60: [2022-12-02 00:51:41,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt. +60: [2022-12-02 00:51:41,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt. +60: [2022-12-02 00:51:41,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt. +60: [2022-12-02 00:51:41,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt. +60: [2022-12-02 00:51:41,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt +60: [2022-12-02 00:51:41,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt +60: [2022-12-02 00:51:41,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +60: [2022-12-02 00:51:41,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +60: [2022-12-02 00:51:41,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt +60: [2022-12-02 00:51:41,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt +60: [2022-12-02 00:51:41,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt +60: [2022-12-02 00:51:41,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt +60: [2022-12-02 00:51:41,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +60: [2022-12-02 00:51:41,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +60: [2022-12-02 00:51:41,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +60: [2022-12-02 00:51:41,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +41: [2022-12-02 00:51:41,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt. +41: [2022-12-02 00:51:41,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt +41: [2022-12-02 00:51:41,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +60: [2022-12-02 00:51:41,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt. +60: [2022-12-02 00:51:41,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt +60: [2022-12-02 00:51:41,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +58: [2022-12-02 00:51:41,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt. +58: [2022-12-02 00:51:41,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt +58: [2022-12-02 00:51:41,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +58: [2022-12-02 00:51:41,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt. +58: [2022-12-02 00:51:41,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt +58: [2022-12-02 00:51:41,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +58: [2022-12-02 00:51:41,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt. +58: [2022-12-02 00:51:41,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt +58: [2022-12-02 00:51:41,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +58: [2022-12-02 00:51:41,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt. +58: [2022-12-02 00:51:41,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt +58: [2022-12-02 00:51:41,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +58: [2022-12-02 00:51:41,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt. +58: [2022-12-02 00:51:41,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt +58: [2022-12-02 00:51:41,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +58: [2022-12-02 00:51:41,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt. +58: [2022-12-02 00:51:41,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt +58: [2022-12-02 00:51:41,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +58: [2022-12-02 00:51:41,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt. +58: [2022-12-02 00:51:41,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt. +58: [2022-12-02 00:51:41,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt +58: [2022-12-02 00:51:41,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +58: [2022-12-02 00:51:41,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt +58: [2022-12-02 00:51:41,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +21: [2022-12-02 00:51:41,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. +21: [2022-12-02 00:51:41,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt +21: [2022-12-02 00:51:41,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +46: [2022-12-02 00:51:41,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt. +46: [2022-12-02 00:51:41,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt. +46: [2022-12-02 00:51:41,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt. +46: [2022-12-02 00:51:41,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt. +46: [2022-12-02 00:51:41,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt +46: [2022-12-02 00:51:41,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt +46: [2022-12-02 00:51:41,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt +46: [2022-12-02 00:51:41,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt +46: [2022-12-02 00:51:41,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +46: [2022-12-02 00:51:41,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +46: [2022-12-02 00:51:41,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +46: [2022-12-02 00:51:41,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +43: [2022-12-02 00:51:41,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt. +46: [2022-12-02 00:51:41,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt. +46: [2022-12-02 00:51:41,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt. +43: [2022-12-02 00:51:41,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt +46: [2022-12-02 00:51:41,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt. +46: [2022-12-02 00:51:41,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt +46: [2022-12-02 00:51:41,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt +43: [2022-12-02 00:51:41,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +46: [2022-12-02 00:51:41,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt +46: [2022-12-02 00:51:41,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +46: [2022-12-02 00:51:41,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +46: [2022-12-02 00:51:41,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +46: [2022-12-02 00:51:41,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt. +46: [2022-12-02 00:51:41,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt +46: [2022-12-02 00:51:41,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +26: [2022-12-02 00:51:41,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. +26: [2022-12-02 00:51:41,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. +26: [2022-12-02 00:51:41,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. +26: [2022-12-02 00:51:41,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. +26: [2022-12-02 00:51:41,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. +26: [2022-12-02 00:51:41,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt +26: [2022-12-02 00:51:41,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt +26: [2022-12-02 00:51:41,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt +26: [2022-12-02 00:51:41,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +26: [2022-12-02 00:51:41,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt +26: [2022-12-02 00:51:41,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +26: [2022-12-02 00:51:41,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +26: [2022-12-02 00:51:41,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt +26: [2022-12-02 00:51:41,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +26: [2022-12-02 00:51:41,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +26: [2022-12-02 00:51:41,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. +26: [2022-12-02 00:51:41,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt +26: [2022-12-02 00:51:41,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +26: [2022-12-02 00:51:41,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. +26: [2022-12-02 00:51:41,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt +26: [2022-12-02 00:51:41,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +35: [2022-12-02 00:51:41,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt. +35: [2022-12-02 00:51:41,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt. +35: [2022-12-02 00:51:41,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt. +35: [2022-12-02 00:51:41,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt. +35: [2022-12-02 00:51:41,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt. +35: [2022-12-02 00:51:41,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt. +35: [2022-12-02 00:51:41,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt +35: [2022-12-02 00:51:41,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt +35: [2022-12-02 00:51:41,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt + 0: [2022-12-02 00:51:41,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +35: [2022-12-02 00:51:41,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt. +35: [2022-12-02 00:51:41,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt + 0: [2022-12-02 00:51:41,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +35: [2022-12-02 00:51:41,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt +35: [2022-12-02 00:51:41,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt +35: [2022-12-02 00:51:41,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +35: [2022-12-02 00:51:41,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +35: [2022-12-02 00:51:41,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +35: [2022-12-02 00:51:41,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +35: [2022-12-02 00:51:41,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt +35: [2022-12-02 00:51:41,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +35: [2022-12-02 00:51:41,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +35: [2022-12-02 00:51:41,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +40: [2022-12-02 00:51:41,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt. +40: [2022-12-02 00:51:41,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt. +40: [2022-12-02 00:51:41,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt. +40: [2022-12-02 00:51:41,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt +40: [2022-12-02 00:51:41,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt. +40: [2022-12-02 00:51:41,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +40: [2022-12-02 00:51:41,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt +40: [2022-12-02 00:51:41,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt +40: [2022-12-02 00:51:41,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt +40: [2022-12-02 00:51:41,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +40: [2022-12-02 00:51:41,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +40: [2022-12-02 00:51:41,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +40: [2022-12-02 00:51:41,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt. +40: [2022-12-02 00:51:41,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt. +40: [2022-12-02 00:51:41,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt. +40: [2022-12-02 00:51:41,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt. +40: [2022-12-02 00:51:41,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt +40: [2022-12-02 00:51:41,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt +40: [2022-12-02 00:51:41,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt +40: [2022-12-02 00:51:41,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt +40: [2022-12-02 00:51:41,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +40: [2022-12-02 00:51:41,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +40: [2022-12-02 00:51:41,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +40: [2022-12-02 00:51:41,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +38: [2022-12-02 00:51:41,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt. +38: [2022-12-02 00:51:41,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt +38: [2022-12-02 00:51:41,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +35: [2022-12-02 00:51:41,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt. +35: [2022-12-02 00:51:41,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt +35: [2022-12-02 00:51:41,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +62: [2022-12-02 00:51:41,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt. +62: [2022-12-02 00:51:41,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt +62: [2022-12-02 00:51:41,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +48: [2022-12-02 00:51:41,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt. +48: [2022-12-02 00:51:41,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt +48: [2022-12-02 00:51:41,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 1: [2022-12-02 00:51:41,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. + 1: [2022-12-02 00:51:41,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt + 1: [2022-12-02 00:51:41,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +44: [2022-12-02 00:51:41,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt. +44: [2022-12-02 00:51:41,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt +44: [2022-12-02 00:51:41,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +20: [2022-12-02 00:51:41,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. +20: [2022-12-02 00:51:41,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. +20: [2022-12-02 00:51:41,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. +20: [2022-12-02 00:51:41,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. +20: [2022-12-02 00:51:41,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. +20: [2022-12-02 00:51:41,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. +20: [2022-12-02 00:51:41,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt +20: [2022-12-02 00:51:41,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt +20: [2022-12-02 00:51:41,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt +20: [2022-12-02 00:51:41,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt +20: [2022-12-02 00:51:41,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt +20: [2022-12-02 00:51:41,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +20: [2022-12-02 00:51:41,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt +20: [2022-12-02 00:51:41,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +20: [2022-12-02 00:51:41,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +20: [2022-12-02 00:51:41,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +20: [2022-12-02 00:51:41,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +20: [2022-12-02 00:51:41,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +45: [2022-12-02 00:51:41,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt. +45: [2022-12-02 00:51:41,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt +45: [2022-12-02 00:51:41,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +56: [2022-12-02 00:51:41,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt. +56: [2022-12-02 00:51:41,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt +56: [2022-12-02 00:51:41,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +44: [2022-12-02 00:51:41,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt. +44: [2022-12-02 00:51:41,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt +44: [2022-12-02 00:51:41,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +54: [2022-12-02 00:51:41,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt. +54: [2022-12-02 00:51:41,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt +54: [2022-12-02 00:51:41,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +49: [2022-12-02 00:51:41,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt. +49: [2022-12-02 00:51:41,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt +49: [2022-12-02 00:51:41,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +11: [2022-12-02 00:51:41,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +11: [2022-12-02 00:51:41,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +11: [2022-12-02 00:51:41,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 1: [2022-12-02 00:51:41,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. +20: [2022-12-02 00:51:41,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. +20: [2022-12-02 00:51:41,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt +20: [2022-12-02 00:51:41,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 1: [2022-12-02 00:51:41,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt + 1: [2022-12-02 00:51:41,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +63: [2022-12-02 00:51:41,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt. +63: [2022-12-02 00:51:41,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt +63: [2022-12-02 00:51:41,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 7: [2022-12-02 00:51:41,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. + 7: [2022-12-02 00:51:41,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt + 7: [2022-12-02 00:51:41,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +26: [2022-12-02 00:51:41,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. +26: [2022-12-02 00:51:41,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt +26: [2022-12-02 00:51:41,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +29: [2022-12-02 00:51:41,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. +33: [2022-12-02 00:51:41,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt. +33: [2022-12-02 00:51:41,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt +33: [2022-12-02 00:51:41,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +47: [2022-12-02 00:51:41,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt. +47: [2022-12-02 00:51:41,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt +47: [2022-12-02 00:51:41,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +10: [2022-12-02 00:51:41,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +10: [2022-12-02 00:51:41,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +10: [2022-12-02 00:51:41,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +20: [2022-12-02 00:51:41,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. +20: [2022-12-02 00:51:41,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt +20: [2022-12-02 00:51:41,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +30: [2022-12-02 00:51:41,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. +30: [2022-12-02 00:51:41,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt +17: [2022-12-02 00:51:41,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. +30: [2022-12-02 00:51:41,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +17: [2022-12-02 00:51:41,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt +17: [2022-12-02 00:51:41,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +45: [2022-12-02 00:51:41,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt. +45: [2022-12-02 00:51:41,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt +45: [2022-12-02 00:51:41,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +29: [2022-12-02 00:51:41,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt +29: [2022-12-02 00:51:41,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +24: [2022-12-02 00:51:41,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. +24: [2022-12-02 00:51:41,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt +24: [2022-12-02 00:51:41,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +27: [2022-12-02 00:51:41,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. +27: [2022-12-02 00:51:41,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt +27: [2022-12-02 00:51:41,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +19: [2022-12-02 00:51:41,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. +19: [2022-12-02 00:51:41,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt +19: [2022-12-02 00:51:41,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +14: [2022-12-02 00:51:41,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. +14: [2022-12-02 00:51:41,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt +14: [2022-12-02 00:51:41,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 6: [2022-12-02 00:51:41,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. + 6: [2022-12-02 00:51:41,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt + 7: [2022-12-02 00:51:41,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. + 6: [2022-12-02 00:51:41,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 7: [2022-12-02 00:51:41,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt + 7: [2022-12-02 00:51:41,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +18: [2022-12-02 00:51:41,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. +52: [2022-12-02 00:51:41,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt. +18: [2022-12-02 00:51:41,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt +18: [2022-12-02 00:51:41,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +52: [2022-12-02 00:51:41,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt +52: [2022-12-02 00:51:41,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +15: [2022-12-02 00:51:41,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. +15: [2022-12-02 00:51:41,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt +15: [2022-12-02 00:51:41,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +11: [2022-12-02 00:51:41,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. +11: [2022-12-02 00:51:41,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt +12: [2022-12-02 00:51:41,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +11: [2022-12-02 00:51:41,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +12: [2022-12-02 00:51:41,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +12: [2022-12-02 00:51:41,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 5: [2022-12-02 00:51:41,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. + 5: [2022-12-02 00:51:41,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt + 8: [2022-12-02 00:51:41,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. + 5: [2022-12-02 00:51:41,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 8: [2022-12-02 00:51:41,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt + 8: [2022-12-02 00:51:41,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +53: [2022-12-02 00:51:41,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt. +53: [2022-12-02 00:51:41,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt +53: [2022-12-02 00:51:41,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 3: [2022-12-02 00:51:41,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. + 3: [2022-12-02 00:51:41,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt + 3: [2022-12-02 00:51:41,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +31: [2022-12-02 00:51:41,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. +31: [2022-12-02 00:51:41,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt +31: [2022-12-02 00:51:41,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +22: [2022-12-02 00:51:41,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. +22: [2022-12-02 00:51:41,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt +22: [2022-12-02 00:51:41,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +39: [2022-12-02 00:51:41,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt. +37: [2022-12-02 00:51:41,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt. +37: [2022-12-02 00:51:41,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt + 9: [2022-12-02 00:51:41,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +37: [2022-12-02 00:51:41,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 9: [2022-12-02 00:51:41,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt + 9: [2022-12-02 00:51:41,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +16: [2022-12-02 00:51:41,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. +16: [2022-12-02 00:51:41,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt +16: [2022-12-02 00:51:41,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +17: [2022-12-02 00:51:41,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. +17: [2022-12-02 00:51:41,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt +17: [2022-12-02 00:51:41,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +57: [2022-12-02 00:51:41,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt. +57: [2022-12-02 00:51:41,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt +57: [2022-12-02 00:51:41,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +25: [2022-12-02 00:51:41,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. +25: [2022-12-02 00:51:41,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt +25: [2022-12-02 00:51:41,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +23: [2022-12-02 00:51:41,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. +23: [2022-12-02 00:51:41,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt +23: [2022-12-02 00:51:41,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +63: [2022-12-02 00:51:41,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt. +63: [2022-12-02 00:51:41,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt +63: [2022-12-02 00:51:41,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +30: [2022-12-02 00:51:41,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. +30: [2022-12-02 00:51:41,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt +30: [2022-12-02 00:51:41,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +10: [2022-12-02 00:51:41,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +10: [2022-12-02 00:51:41,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +10: [2022-12-02 00:51:41,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 6: [2022-12-02 00:51:41,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. + 6: [2022-12-02 00:51:41,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt + 6: [2022-12-02 00:51:41,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +15: [2022-12-02 00:51:41,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. +15: [2022-12-02 00:51:41,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt +15: [2022-12-02 00:51:41,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +13: [2022-12-02 00:51:41,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +13: [2022-12-02 00:51:41,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +13: [2022-12-02 00:51:41,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +21: [2022-12-02 00:51:41,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. +21: [2022-12-02 00:51:41,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt +21: [2022-12-02 00:51:41,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 2: [2022-12-02 00:51:41,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. + 2: [2022-12-02 00:51:41,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt + 2: [2022-12-02 00:51:41,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +16: [2022-12-02 00:51:41,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. +16: [2022-12-02 00:51:41,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt +16: [2022-12-02 00:51:41,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +33: [2022-12-02 00:51:41,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt. +33: [2022-12-02 00:51:41,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt +33: [2022-12-02 00:51:41,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +39: [2022-12-02 00:51:41,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt +24: [2022-12-02 00:51:41,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. +39: [2022-12-02 00:51:41,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +24: [2022-12-02 00:51:41,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt +24: [2022-12-02 00:51:41,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 4: [2022-12-02 00:51:41,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. + 4: [2022-12-02 00:51:41,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt + 4: [2022-12-02 00:51:41,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +29: [2022-12-02 00:51:41,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. +29: [2022-12-02 00:51:41,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt +29: [2022-12-02 00:51:41,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +53: [2022-12-02 00:51:41,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt. +53: [2022-12-02 00:51:41,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt +53: [2022-12-02 00:51:41,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +57: [2022-12-02 00:51:41,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt. +57: [2022-12-02 00:51:41,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt +57: [2022-12-02 00:51:41,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +25: [2022-12-02 00:51:41,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. +25: [2022-12-02 00:51:41,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt +25: [2022-12-02 00:51:41,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 8: [2022-12-02 00:51:41,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. + 8: [2022-12-02 00:51:41,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt + 5: [2022-12-02 00:51:41,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. + 8: [2022-12-02 00:51:41,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 5: [2022-12-02 00:51:41,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt + 5: [2022-12-02 00:51:41,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +22: [2022-12-02 00:51:41,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. +22: [2022-12-02 00:51:41,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt +22: [2022-12-02 00:51:41,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +28: [2022-12-02 00:51:41,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. +28: [2022-12-02 00:51:41,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt +28: [2022-12-02 00:51:41,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +17: [2022-12-02 00:51:41,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. +17: [2022-12-02 00:51:41,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt +17: [2022-12-02 00:51:41,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 3: [2022-12-02 00:51:41,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. + 3: [2022-12-02 00:51:41,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt + 3: [2022-12-02 00:51:41,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 2: [2022-12-02 00:51:41,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. + 2: [2022-12-02 00:51:41,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt + 2: [2022-12-02 00:51:41,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +23: [2022-12-02 00:51:41,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. +23: [2022-12-02 00:51:41,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt +23: [2022-12-02 00:51:41,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 9: [2022-12-02 00:51:41,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. + 9: [2022-12-02 00:51:41,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt + 9: [2022-12-02 00:51:41,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +15: [2022-12-02 00:51:41,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +13: [2022-12-02 00:51:41,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. +15: [2022-12-02 00:51:41,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +13: [2022-12-02 00:51:41,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt +15: [2022-12-02 00:51:41,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +13: [2022-12-02 00:51:41,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +11: [2022-12-02 00:51:41,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +11: [2022-12-02 00:51:41,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +11: [2022-12-02 00:51:41,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +14: [2022-12-02 00:51:41,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +14: [2022-12-02 00:51:41,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +29: [2022-12-02 00:51:41,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. +14: [2022-12-02 00:51:41,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +29: [2022-12-02 00:51:41,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt +29: [2022-12-02 00:51:41,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +30: [2022-12-02 00:51:41,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. +30: [2022-12-02 00:51:41,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt +30: [2022-12-02 00:51:41,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +16: [2022-12-02 00:51:41,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. +16: [2022-12-02 00:51:41,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt +16: [2022-12-02 00:51:41,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +18: [2022-12-02 00:51:41,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. +18: [2022-12-02 00:51:41,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. +18: [2022-12-02 00:51:41,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt +18: [2022-12-02 00:51:41,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt +18: [2022-12-02 00:51:41,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +18: [2022-12-02 00:51:41,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +25: [2022-12-02 00:51:41,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. +25: [2022-12-02 00:51:41,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt +25: [2022-12-02 00:51:41,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 8: [2022-12-02 00:51:41,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. + 8: [2022-12-02 00:51:41,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt + 8: [2022-12-02 00:51:41,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 9: [2022-12-02 00:51:41,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. + 9: [2022-12-02 00:51:41,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt + 9: [2022-12-02 00:51:41,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +21: [2022-12-02 00:51:41,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. +22: [2022-12-02 00:51:41,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. +21: [2022-12-02 00:51:41,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt +21: [2022-12-02 00:51:41,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +60: [2022-12-02 00:51:41,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt. +22: [2022-12-02 00:51:41,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt +60: [2022-12-02 00:51:41,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt +22: [2022-12-02 00:51:41,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +60: [2022-12-02 00:51:41,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +31: [2022-12-02 00:51:41,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. +31: [2022-12-02 00:51:41,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt +31: [2022-12-02 00:51:41,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 6: [2022-12-02 00:51:41,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. + 6: [2022-12-02 00:51:41,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt + 6: [2022-12-02 00:51:41,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +12: [2022-12-02 00:51:41,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +12: [2022-12-02 00:51:41,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +12: [2022-12-02 00:51:41,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +42: [2022-12-02 00:51:41,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt. +42: [2022-12-02 00:51:41,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt +42: [2022-12-02 00:51:41,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +23: [2022-12-02 00:51:41,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. +23: [2022-12-02 00:51:41,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt +23: [2022-12-02 00:51:41,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +21: [2022-12-02 00:51:41,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. +21: [2022-12-02 00:51:41,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt +21: [2022-12-02 00:51:41,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +14: [2022-12-02 00:51:41,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +14: [2022-12-02 00:51:41,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +14: [2022-12-02 00:51:41,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +13: [2022-12-02 00:51:41,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +13: [2022-12-02 00:51:41,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +13: [2022-12-02 00:51:41,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 5: [2022-12-02 00:51:41,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. + 5: [2022-12-02 00:51:41,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt + 5: [2022-12-02 00:51:41,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +12: [2022-12-02 00:51:41,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. +12: [2022-12-02 00:51:41,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt +12: [2022-12-02 00:51:41,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +39: [2022-12-02 00:51:41,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt. +31: [2022-12-02 00:51:41,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. +31: [2022-12-02 00:51:41,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt +31: [2022-12-02 00:51:41,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 4: [2022-12-02 00:51:41,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. + 4: [2022-12-02 00:51:41,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt + 4: [2022-12-02 00:51:41,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +37: [2022-12-02 00:51:41,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt. +37: [2022-12-02 00:51:41,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt +37: [2022-12-02 00:51:41,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 4: [2022-12-02 00:51:41,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. + 4: [2022-12-02 00:51:41,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt + 4: [2022-12-02 00:51:41,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +37: [2022-12-02 00:51:41,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt. +37: [2022-12-02 00:51:41,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt +37: [2022-12-02 00:51:41,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +39: [2022-12-02 00:51:41,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt +39: [2022-12-02 00:51:41,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +39: [2022-12-02 00:51:41,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt. +39: [2022-12-02 00:51:41,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt +39: [2022-12-02 00:51:41,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 3: [2022-12-02 00:51:41,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. + 3: [2022-12-02 00:51:41,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt + 3: [2022-12-02 00:51:41,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 1: [2022-12-02 00:51:41,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. + 1: [2022-12-02 00:51:41,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt + 1: [2022-12-02 00:51:41,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 1: [2022-12-02 00:51:41,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. + 1: [2022-12-02 00:51:41,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt + 1: [2022-12-02 00:51:41,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +48: [2022-12-02 00:51:41,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt. +48: [2022-12-02 00:51:41,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt +48: [2022-12-02 00:51:41,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +24: [2022-12-02 00:51:41,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. +24: [2022-12-02 00:51:41,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt +24: [2022-12-02 00:51:41,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +10: [2022-12-02 00:51:41,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +10: [2022-12-02 00:51:41,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +10: [2022-12-02 00:51:41,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +42: [2022-12-02 00:51:41,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt. +42: [2022-12-02 00:51:41,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt +42: [2022-12-02 00:51:41,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +28: [2022-12-02 00:51:41,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. +28: [2022-12-02 00:51:41,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step4000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt +28: [2022-12-02 00:51:41,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! + 0: successfully saved checkpoint at iteration 4000 to checkpoints_8b7beta +63: time (ms) | save-checkpoint: 7799.84 +63: iteration 4010/ 5494 | consumed samples: 4106240 | consumed tokens: 8409579520 | elapsed time per iteration (s): 6.42 | learning rate: 5.109E-05 | global batch size: 1024 | lm loss: 2.195080E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 159.556 | TFLOPs: 35.67 | +63: iteration 4020/ 5494 | consumed samples: 4116480 | consumed tokens: 8430551040 | elapsed time per iteration (s): 5.73 | learning rate: 5.070E-05 | global batch size: 1024 | lm loss: 2.191749E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.849 | TFLOPs: 39.98 | +63: iteration 4030/ 5494 | consumed samples: 4126720 | consumed tokens: 8451522560 | elapsed time per iteration (s): 5.72 | learning rate: 5.031E-05 | global batch size: 1024 | lm loss: 2.194952E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.957 | TFLOPs: 40.01 | +63: iteration 4040/ 5494 | consumed samples: 4136960 | consumed tokens: 8472494080 | elapsed time per iteration (s): 5.64 | learning rate: 4.992E-05 | global batch size: 1024 | lm loss: 2.182461E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.466 | TFLOPs: 40.57 | +63: iteration 4050/ 5494 | consumed samples: 4147200 | consumed tokens: 8493465600 | elapsed time per iteration (s): 5.63 | learning rate: 4.953E-05 | global batch size: 1024 | lm loss: 2.183336E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.007 | TFLOPs: 40.69 | +63: iteration 4060/ 5494 | consumed samples: 4157440 | consumed tokens: 8514437120 | elapsed time per iteration (s): 5.76 | learning rate: 4.915E-05 | global batch size: 1024 | lm loss: 2.177393E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.759 | TFLOPs: 39.74 | +63: iteration 4070/ 5494 | consumed samples: 4167680 | consumed tokens: 8535408640 | elapsed time per iteration (s): 6.17 | learning rate: 4.877E-05 | global batch size: 1024 | lm loss: 2.180670E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 165.912 | TFLOPs: 37.09 | +63: iteration 4080/ 5494 | consumed samples: 4177920 | consumed tokens: 8556380160 | elapsed time per iteration (s): 5.42 | learning rate: 4.839E-05 | global batch size: 1024 | lm loss: 2.193736E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.886 | TFLOPs: 42.23 | +63: iteration 4090/ 5494 | consumed samples: 4188160 | consumed tokens: 8577351680 | elapsed time per iteration (s): 5.73 | learning rate: 4.801E-05 | global batch size: 1024 | lm loss: 2.186554E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.585 | TFLOPs: 39.93 | +63: iteration 4100/ 5494 | consumed samples: 4198400 | consumed tokens: 8598323200 | elapsed time per iteration (s): 5.74 | learning rate: 4.763E-05 | global batch size: 1024 | lm loss: 2.200090E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.292 | TFLOPs: 39.86 | +63: iteration 4110/ 5494 | consumed samples: 4208640 | consumed tokens: 8619294720 | elapsed time per iteration (s): 5.92 | learning rate: 4.726E-05 | global batch size: 1024 | lm loss: 2.182776E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.962 | TFLOPs: 38.67 | +63: iteration 4120/ 5494 | consumed samples: 4218880 | consumed tokens: 8640266240 | elapsed time per iteration (s): 5.50 | learning rate: 4.689E-05 | global batch size: 1024 | lm loss: 2.186254E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.144 | TFLOPs: 41.62 | +63: iteration 4130/ 5494 | consumed samples: 4229120 | consumed tokens: 8661237760 | elapsed time per iteration (s): 5.87 | learning rate: 4.652E-05 | global batch size: 1024 | lm loss: 2.185421E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.513 | TFLOPs: 39.02 | +63: iteration 4140/ 5494 | consumed samples: 4239360 | consumed tokens: 8682209280 | elapsed time per iteration (s): 5.69 | learning rate: 4.615E-05 | global batch size: 1024 | lm loss: 2.181460E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.101 | TFLOPs: 40.26 | +63: iteration 4150/ 5494 | consumed samples: 4249600 | consumed tokens: 8703180800 | elapsed time per iteration (s): 5.80 | learning rate: 4.579E-05 | global batch size: 1024 | lm loss: 2.179684E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.667 | TFLOPs: 39.50 | +63: iteration 4160/ 5494 | consumed samples: 4259840 | consumed tokens: 8724152320 | elapsed time per iteration (s): 5.66 | learning rate: 4.542E-05 | global batch size: 1024 | lm loss: 2.190926E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.980 | TFLOPs: 40.46 | +63: iteration 4170/ 5494 | consumed samples: 4270080 | consumed tokens: 8745123840 | elapsed time per iteration (s): 5.75 | learning rate: 4.506E-05 | global batch size: 1024 | lm loss: 2.173369E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.143 | TFLOPs: 39.83 | +63: iteration 4180/ 5494 | consumed samples: 4280320 | consumed tokens: 8766095360 | elapsed time per iteration (s): 5.43 | learning rate: 4.470E-05 | global batch size: 1024 | lm loss: 2.166966E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.479 | TFLOPs: 42.14 | +63: iteration 4190/ 5494 | consumed samples: 4290560 | consumed tokens: 8787066880 | elapsed time per iteration (s): 5.39 | learning rate: 4.435E-05 | global batch size: 1024 | lm loss: 2.192517E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.151 | TFLOPs: 42.51 | +63: iteration 4200/ 5494 | consumed samples: 4300800 | consumed tokens: 8808038400 | elapsed time per iteration (s): 5.60 | learning rate: 4.399E-05 | global batch size: 1024 | lm loss: 2.169495E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.697 | TFLOPs: 40.84 | +63: iteration 4210/ 5494 | consumed samples: 4311040 | consumed tokens: 8829009920 | elapsed time per iteration (s): 5.64 | learning rate: 4.364E-05 | global batch size: 1024 | lm loss: 2.161168E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.667 | TFLOPs: 40.61 | +63: iteration 4220/ 5494 | consumed samples: 4321280 | consumed tokens: 8849981440 | elapsed time per iteration (s): 5.59 | learning rate: 4.329E-05 | global batch size: 1024 | lm loss: 2.177979E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.228 | TFLOPs: 40.96 | +63: iteration 4230/ 5494 | consumed samples: 4331520 | consumed tokens: 8870952960 | elapsed time per iteration (s): 5.61 | learning rate: 4.294E-05 | global batch size: 1024 | lm loss: 2.166492E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.526 | TFLOPs: 40.81 | +63: iteration 4240/ 5494 | consumed samples: 4341760 | consumed tokens: 8891924480 | elapsed time per iteration (s): 5.83 | learning rate: 4.260E-05 | global batch size: 1024 | lm loss: 2.165850E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.537 | TFLOPs: 39.24 | +63: iteration 4250/ 5494 | consumed samples: 4352000 | consumed tokens: 8912896000 | elapsed time per iteration (s): 5.58 | learning rate: 4.225E-05 | global batch size: 1024 | lm loss: 2.188190E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.595 | TFLOPs: 41.05 | +63: iteration 4260/ 5494 | consumed samples: 4362240 | consumed tokens: 8933867520 | elapsed time per iteration (s): 5.51 | learning rate: 4.191E-05 | global batch size: 1024 | lm loss: 2.190292E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.733 | TFLOPs: 41.52 | +63: iteration 4270/ 5494 | consumed samples: 4372480 | consumed tokens: 8954839040 | elapsed time per iteration (s): 5.62 | learning rate: 4.157E-05 | global batch size: 1024 | lm loss: 2.170879E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.285 | TFLOPs: 40.75 | +63: iteration 4280/ 5494 | consumed samples: 4382720 | consumed tokens: 8975810560 | elapsed time per iteration (s): 5.63 | learning rate: 4.124E-05 | global batch size: 1024 | lm loss: 2.181679E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.820 | TFLOPs: 40.65 | +63: iteration 4290/ 5494 | consumed samples: 4392960 | consumed tokens: 8996782080 | elapsed time per iteration (s): 5.65 | learning rate: 4.090E-05 | global batch size: 1024 | lm loss: 2.167886E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.131 | TFLOPs: 40.49 | +63: iteration 4300/ 5494 | consumed samples: 4403200 | consumed tokens: 9017753600 | elapsed time per iteration (s): 5.83 | learning rate: 4.057E-05 | global batch size: 1024 | lm loss: 2.171222E+00 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.499 | TFLOPs: 39.24 | +63: iteration 4310/ 5494 | consumed samples: 4413440 | consumed tokens: 9038725120 | elapsed time per iteration (s): 5.53 | learning rate: 4.024E-05 | global batch size: 1024 | lm loss: 2.174121E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.088 | TFLOPs: 41.38 | +63: iteration 4320/ 5494 | consumed samples: 4423680 | consumed tokens: 9059696640 | elapsed time per iteration (s): 5.48 | learning rate: 3.991E-05 | global batch size: 1024 | lm loss: 2.176246E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.702 | TFLOPs: 41.74 | +63: iteration 4330/ 5494 | consumed samples: 4433920 | consumed tokens: 9080668160 | elapsed time per iteration (s): 5.50 | learning rate: 3.959E-05 | global batch size: 1024 | lm loss: 2.170661E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.024 | TFLOPs: 41.59 | +63: iteration 4340/ 5494 | consumed samples: 4444160 | consumed tokens: 9101639680 | elapsed time per iteration (s): 5.38 | learning rate: 3.927E-05 | global batch size: 1024 | lm loss: 2.161198E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.314 | TFLOPs: 42.55 | +63: iteration 4350/ 5494 | consumed samples: 4454400 | consumed tokens: 9122611200 | elapsed time per iteration (s): 5.73 | learning rate: 3.895E-05 | global batch size: 1024 | lm loss: 2.177530E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.624 | TFLOPs: 39.93 | +63: iteration 4360/ 5494 | consumed samples: 4464640 | consumed tokens: 9143582720 | elapsed time per iteration (s): 5.71 | learning rate: 3.863E-05 | global batch size: 1024 | lm loss: 2.169451E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.368 | TFLOPs: 40.10 | +63: iteration 4370/ 5494 | consumed samples: 4474880 | consumed tokens: 9164554240 | elapsed time per iteration (s): 5.61 | learning rate: 3.831E-05 | global batch size: 1024 | lm loss: 2.170414E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.614 | TFLOPs: 40.83 | +63: iteration 4380/ 5494 | consumed samples: 4485120 | consumed tokens: 9185525760 | elapsed time per iteration (s): 5.56 | learning rate: 3.800E-05 | global batch size: 1024 | lm loss: 2.166671E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.134 | TFLOPs: 41.17 | +63: iteration 4390/ 5494 | consumed samples: 4495360 | consumed tokens: 9206497280 | elapsed time per iteration (s): 5.97 | learning rate: 3.769E-05 | global batch size: 1024 | lm loss: 2.184775E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.445 | TFLOPs: 38.33 | +63: iteration 4400/ 5494 | consumed samples: 4505600 | consumed tokens: 9227468800 | elapsed time per iteration (s): 5.61 | learning rate: 3.738E-05 | global batch size: 1024 | lm loss: 2.169910E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.573 | TFLOPs: 40.82 | +63: iteration 4410/ 5494 | consumed samples: 4515840 | consumed tokens: 9248440320 | elapsed time per iteration (s): 5.64 | learning rate: 3.708E-05 | global batch size: 1024 | lm loss: 2.166733E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.694 | TFLOPs: 40.62 | +63: iteration 4420/ 5494 | consumed samples: 4526080 | consumed tokens: 9269411840 | elapsed time per iteration (s): 5.54 | learning rate: 3.677E-05 | global batch size: 1024 | lm loss: 2.164046E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.804 | TFLOPs: 41.32 | +63: iteration 4430/ 5494 | consumed samples: 4536320 | consumed tokens: 9290383360 | elapsed time per iteration (s): 5.54 | learning rate: 3.647E-05 | global batch size: 1024 | lm loss: 2.177029E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.852 | TFLOPs: 41.33 | +63: iteration 4440/ 5494 | consumed samples: 4546560 | consumed tokens: 9311354880 | elapsed time per iteration (s): 5.75 | learning rate: 3.617E-05 | global batch size: 1024 | lm loss: 2.169613E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.935 | TFLOPs: 39.78 | +63: iteration 4450/ 5494 | consumed samples: 4556800 | consumed tokens: 9332326400 | elapsed time per iteration (s): 5.65 | learning rate: 3.588E-05 | global batch size: 1024 | lm loss: 2.173423E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.310 | TFLOPs: 40.53 | +63: iteration 4460/ 5494 | consumed samples: 4567040 | consumed tokens: 9353297920 | elapsed time per iteration (s): 6.07 | learning rate: 3.558E-05 | global batch size: 1024 | lm loss: 2.171627E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.761 | TFLOPs: 37.73 | +63: iteration 4470/ 5494 | consumed samples: 4577280 | consumed tokens: 9374269440 | elapsed time per iteration (s): 5.60 | learning rate: 3.529E-05 | global batch size: 1024 | lm loss: 2.167310E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.861 | TFLOPs: 40.88 | +63: iteration 4480/ 5494 | consumed samples: 4587520 | consumed tokens: 9395240960 | elapsed time per iteration (s): 5.49 | learning rate: 3.500E-05 | global batch size: 1024 | lm loss: 2.159710E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.616 | TFLOPs: 41.72 | +63: iteration 4490/ 5494 | consumed samples: 4597760 | consumed tokens: 9416212480 | elapsed time per iteration (s): 5.73 | learning rate: 3.472E-05 | global batch size: 1024 | lm loss: 2.170429E+00 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.659 | TFLOPs: 39.94 | +63: iteration 4500/ 5494 | consumed samples: 4608000 | consumed tokens: 9437184000 | elapsed time per iteration (s): 5.63 | learning rate: 3.443E-05 | global batch size: 1024 | lm loss: 2.169520E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.001 | TFLOPs: 40.69 | +63: iteration 4510/ 5494 | consumed samples: 4618240 | consumed tokens: 9458155520 | elapsed time per iteration (s): 5.50 | learning rate: 3.415E-05 | global batch size: 1024 | lm loss: 2.169555E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.160 | TFLOPs: 41.62 | +63: iteration 4520/ 5494 | consumed samples: 4628480 | consumed tokens: 9479127040 | elapsed time per iteration (s): 5.61 | learning rate: 3.387E-05 | global batch size: 1024 | lm loss: 2.159646E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.567 | TFLOPs: 40.82 | +63: iteration 4530/ 5494 | consumed samples: 4638720 | consumed tokens: 9500098560 | elapsed time per iteration (s): 5.87 | learning rate: 3.360E-05 | global batch size: 1024 | lm loss: 2.180525E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.483 | TFLOPs: 39.01 | +63: iteration 4540/ 5494 | consumed samples: 4648960 | consumed tokens: 9521070080 | elapsed time per iteration (s): 5.51 | learning rate: 3.332E-05 | global batch size: 1024 | lm loss: 2.147970E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.739 | TFLOPs: 41.53 | +63: iteration 4550/ 5494 | consumed samples: 4659200 | consumed tokens: 9542041600 | elapsed time per iteration (s): 5.84 | learning rate: 3.305E-05 | global batch size: 1024 | lm loss: 2.157597E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.284 | TFLOPs: 39.19 | +63: iteration 4560/ 5494 | consumed samples: 4669440 | consumed tokens: 9563013120 | elapsed time per iteration (s): 5.64 | learning rate: 3.278E-05 | global batch size: 1024 | lm loss: 2.151907E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.464 | TFLOPs: 40.57 | +63: iteration 4570/ 5494 | consumed samples: 4679680 | consumed tokens: 9583984640 | elapsed time per iteration (s): 5.61 | learning rate: 3.252E-05 | global batch size: 1024 | lm loss: 2.154163E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.586 | TFLOPs: 40.82 | +63: iteration 4580/ 5494 | consumed samples: 4689920 | consumed tokens: 9604956160 | elapsed time per iteration (s): 5.54 | learning rate: 3.226E-05 | global batch size: 1024 | lm loss: 2.168870E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.878 | TFLOPs: 41.33 | +63: iteration 4590/ 5494 | consumed samples: 4700160 | consumed tokens: 9625927680 | elapsed time per iteration (s): 5.79 | learning rate: 3.200E-05 | global batch size: 1024 | lm loss: 2.159379E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.789 | TFLOPs: 39.52 | +63: iteration 4600/ 5494 | consumed samples: 4710400 | consumed tokens: 9646899200 | elapsed time per iteration (s): 5.61 | learning rate: 3.174E-05 | global batch size: 1024 | lm loss: 2.175089E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.419 | TFLOPs: 40.78 | +63: iteration 4610/ 5494 | consumed samples: 4720640 | consumed tokens: 9667870720 | elapsed time per iteration (s): 5.50 | learning rate: 3.148E-05 | global batch size: 1024 | lm loss: 2.162406E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.057 | TFLOPs: 41.60 | +63: iteration 4620/ 5494 | consumed samples: 4730880 | consumed tokens: 9688842240 | elapsed time per iteration (s): 5.63 | learning rate: 3.123E-05 | global batch size: 1024 | lm loss: 2.141910E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.021 | TFLOPs: 40.69 | +63: iteration 4630/ 5494 | consumed samples: 4741120 | consumed tokens: 9709813760 | elapsed time per iteration (s): 5.78 | learning rate: 3.098E-05 | global batch size: 1024 | lm loss: 2.152077E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.278 | TFLOPs: 39.63 | +63: iteration 4640/ 5494 | consumed samples: 4751360 | consumed tokens: 9730785280 | elapsed time per iteration (s): 5.53 | learning rate: 3.073E-05 | global batch size: 1024 | lm loss: 2.169696E+00 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.067 | TFLOPs: 41.37 | +63: iteration 4650/ 5494 | consumed samples: 4761600 | consumed tokens: 9751756800 | elapsed time per iteration (s): 5.38 | learning rate: 3.049E-05 | global batch size: 1024 | lm loss: 2.150908E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.259 | TFLOPs: 42.54 | +63: iteration 4660/ 5494 | consumed samples: 4771840 | consumed tokens: 9772728320 | elapsed time per iteration (s): 5.52 | learning rate: 3.024E-05 | global batch size: 1024 | lm loss: 2.163811E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.625 | TFLOPs: 41.50 | +63: iteration 4670/ 5494 | consumed samples: 4782080 | consumed tokens: 9793699840 | elapsed time per iteration (s): 5.74 | learning rate: 3.000E-05 | global batch size: 1024 | lm loss: 2.169174E+00 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.447 | TFLOPs: 39.89 | +63: iteration 4680/ 5494 | consumed samples: 4792320 | consumed tokens: 9814671360 | elapsed time per iteration (s): 5.49 | learning rate: 2.977E-05 | global batch size: 1024 | lm loss: 2.162837E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.523 | TFLOPs: 41.70 | +63: iteration 4690/ 5494 | consumed samples: 4802560 | consumed tokens: 9835642880 | elapsed time per iteration (s): 5.69 | learning rate: 2.953E-05 | global batch size: 1024 | lm loss: 2.161412E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.872 | TFLOPs: 40.21 | +63: iteration 4700/ 5494 | consumed samples: 4812800 | consumed tokens: 9856614400 | elapsed time per iteration (s): 5.47 | learning rate: 2.930E-05 | global batch size: 1024 | lm loss: 2.154110E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 187.186 | TFLOPs: 41.85 | +63: iteration 4710/ 5494 | consumed samples: 4823040 | consumed tokens: 9877585920 | elapsed time per iteration (s): 5.77 | learning rate: 2.907E-05 | global batch size: 1024 | lm loss: 2.144799E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.500 | TFLOPs: 39.68 | +63: iteration 4720/ 5494 | consumed samples: 4833280 | consumed tokens: 9898557440 | elapsed time per iteration (s): 5.57 | learning rate: 2.885E-05 | global batch size: 1024 | lm loss: 2.154534E+00 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.000 | TFLOPs: 41.14 | +63: iteration 4730/ 5494 | consumed samples: 4843520 | consumed tokens: 9919528960 | elapsed time per iteration (s): 5.51 | learning rate: 2.862E-05 | global batch size: 1024 | lm loss: 2.154471E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.008 | TFLOPs: 41.59 | +63: iteration 4740/ 5494 | consumed samples: 4853760 | consumed tokens: 9940500480 | elapsed time per iteration (s): 5.74 | learning rate: 2.840E-05 | global batch size: 1024 | lm loss: 2.155236E+00 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.422 | TFLOPs: 39.89 | +63: iteration 4750/ 5494 | consumed samples: 4864000 | consumed tokens: 9961472000 | elapsed time per iteration (s): 5.57 | learning rate: 2.819E-05 | global batch size: 1024 | lm loss: 2.172292E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.953 | TFLOPs: 41.13 | +63: iteration 4760/ 5494 | consumed samples: 4874240 | consumed tokens: 9982443520 | elapsed time per iteration (s): 5.54 | learning rate: 2.797E-05 | global batch size: 1024 | lm loss: 2.166580E+00 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.893 | TFLOPs: 41.34 | +63: iteration 4770/ 5494 | consumed samples: 4884480 | consumed tokens: 10003415040 | elapsed time per iteration (s): 5.66 | learning rate: 2.776E-05 | global batch size: 1024 | lm loss: 2.146791E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.065 | TFLOPs: 40.48 | +63: iteration 4780/ 5494 | consumed samples: 4894720 | consumed tokens: 10024386560 | elapsed time per iteration (s): 5.62 | learning rate: 2.755E-05 | global batch size: 1024 | lm loss: 2.174049E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.159 | TFLOPs: 40.72 | +63: iteration 4790/ 5494 | consumed samples: 4904960 | consumed tokens: 10045358080 | elapsed time per iteration (s): 5.62 | learning rate: 2.734E-05 | global batch size: 1024 | lm loss: 2.148613E+00 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.075 | TFLOPs: 40.71 | +63: iteration 4800/ 5494 | consumed samples: 4915200 | consumed tokens: 10066329600 | elapsed time per iteration (s): 5.38 | learning rate: 2.714E-05 | global batch size: 1024 | lm loss: 2.156477E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.470 | TFLOPs: 42.58 | +63: iteration 4810/ 5494 | consumed samples: 4925440 | consumed tokens: 10087301120 | elapsed time per iteration (s): 5.63 | learning rate: 2.694E-05 | global batch size: 1024 | lm loss: 2.156234E+00 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.961 | TFLOPs: 40.68 | +63: iteration 4820/ 5494 | consumed samples: 4935680 | consumed tokens: 10108272640 | elapsed time per iteration (s): 5.54 | learning rate: 2.674E-05 | global batch size: 1024 | lm loss: 2.152144E+00 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.917 | TFLOPs: 41.34 | +63: iteration 4830/ 5494 | consumed samples: 4945920 | consumed tokens: 10129244160 | elapsed time per iteration (s): 5.64 | learning rate: 2.654E-05 | global batch size: 1024 | lm loss: 2.147784E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.421 | TFLOPs: 40.56 | +63: iteration 4840/ 5494 | consumed samples: 4956160 | consumed tokens: 10150215680 | elapsed time per iteration (s): 5.55 | learning rate: 2.635E-05 | global batch size: 1024 | lm loss: 2.144873E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.433 | TFLOPs: 41.23 | +63: iteration 4850/ 5494 | consumed samples: 4966400 | consumed tokens: 10171187200 | elapsed time per iteration (s): 5.71 | learning rate: 2.616E-05 | global batch size: 1024 | lm loss: 2.146215E+00 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.291 | TFLOPs: 40.08 | +63: iteration 4860/ 5494 | consumed samples: 4976640 | consumed tokens: 10192158720 | elapsed time per iteration (s): 5.50 | learning rate: 2.597E-05 | global batch size: 1024 | lm loss: 2.161773E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.130 | TFLOPs: 41.61 | +63: iteration 4870/ 5494 | consumed samples: 4986880 | consumed tokens: 10213130240 | elapsed time per iteration (s): 5.55 | learning rate: 2.578E-05 | global batch size: 1024 | lm loss: 2.145215E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.552 | TFLOPs: 41.26 | +63: iteration 4880/ 5494 | consumed samples: 4997120 | consumed tokens: 10234101760 | elapsed time per iteration (s): 5.59 | learning rate: 2.560E-05 | global batch size: 1024 | lm loss: 2.156174E+00 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.048 | TFLOPs: 40.92 | +63: iteration 4890/ 5494 | consumed samples: 5007360 | consumed tokens: 10255073280 | elapsed time per iteration (s): 5.51 | learning rate: 2.542E-05 | global batch size: 1024 | lm loss: 2.153667E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.787 | TFLOPs: 41.54 | +63: iteration 4900/ 5494 | consumed samples: 5017600 | consumed tokens: 10276044800 | elapsed time per iteration (s): 5.62 | learning rate: 2.525E-05 | global batch size: 1024 | lm loss: 2.136637E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.166 | TFLOPs: 40.73 | +63: iteration 4910/ 5494 | consumed samples: 5027840 | consumed tokens: 10297016320 | elapsed time per iteration (s): 5.47 | learning rate: 2.507E-05 | global batch size: 1024 | lm loss: 2.162029E+00 | grad norm: 0.112 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 187.204 | TFLOPs: 41.85 | +63: iteration 4920/ 5494 | consumed samples: 5038080 | consumed tokens: 10317987840 | elapsed time per iteration (s): 5.73 | learning rate: 2.490E-05 | global batch size: 1024 | lm loss: 2.136198E+00 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.859 | TFLOPs: 39.99 | +63: iteration 4930/ 5494 | consumed samples: 5048320 | consumed tokens: 10338959360 | elapsed time per iteration (s): 5.51 | learning rate: 2.474E-05 | global batch size: 1024 | lm loss: 2.136574E+00 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.862 | TFLOPs: 41.55 | +63: iteration 4940/ 5494 | consumed samples: 5058560 | consumed tokens: 10359930880 | elapsed time per iteration (s): 5.61 | learning rate: 2.457E-05 | global batch size: 1024 | lm loss: 2.152442E+00 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.678 | TFLOPs: 40.84 | +63: iteration 4950/ 5494 | consumed samples: 5068800 | consumed tokens: 10380902400 | elapsed time per iteration (s): 5.50 | learning rate: 2.441E-05 | global batch size: 1024 | lm loss: 2.145054E+00 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.185 | TFLOPs: 41.62 | +63: iteration 4960/ 5494 | consumed samples: 5079040 | consumed tokens: 10401873920 | elapsed time per iteration (s): 5.47 | learning rate: 2.425E-05 | global batch size: 1024 | lm loss: 2.139309E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 187.066 | TFLOPs: 41.82 | +63: iteration 4970/ 5494 | consumed samples: 5089280 | consumed tokens: 10422845440 | elapsed time per iteration (s): 5.73 | learning rate: 2.409E-05 | global batch size: 1024 | lm loss: 2.160622E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.731 | TFLOPs: 39.96 | +63: iteration 4980/ 5494 | consumed samples: 5099520 | consumed tokens: 10443816960 | elapsed time per iteration (s): 5.50 | learning rate: 2.394E-05 | global batch size: 1024 | lm loss: 2.158132E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.133 | TFLOPs: 41.61 | +63: iteration 4990/ 5494 | consumed samples: 5109760 | consumed tokens: 10464788480 | elapsed time per iteration (s): 5.64 | learning rate: 2.379E-05 | global batch size: 1024 | lm loss: 2.133658E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.699 | TFLOPs: 40.62 | +63: iteration 5000/ 5494 | consumed samples: 5120000 | consumed tokens: 10485760000 | elapsed time per iteration (s): 5.63 | learning rate: 2.364E-05 | global batch size: 1024 | lm loss: 2.165443E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.979 | TFLOPs: 40.68 | +63: ------------------------------------------------------------------------------------------ +63: valid loss at iteration 5000 | lm loss value: 2.086409E+00 | lm loss PPL: 8.055935E+00 | +63: ------------------------------------------------------------------------------------------ + 0: saving checkpoint at iteration 5000 to checkpoints_8b7beta + 0: [2022-12-02 02:25:30,521] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5000 is begin to save! + 0: [2022-12-02 02:25:30,567] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_01-model_00-model_states.pt... + 0: [2022-12-02 02:25:30,567] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_01-model_01-model_states.pt... +32: [2022-12-02 02:25:30,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_24-model_00-model_states.pt... +32: [2022-12-02 02:25:30,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_24-model_01-model_states.pt... +32: [2022-12-02 02:25:31,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_24-model_01-model_states.pt. +32: [2022-12-02 02:25:31,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_24-model_00-model_states.pt. + 0: [2022-12-02 02:25:31,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_01-model_01-model_states.pt. + 0: [2022-12-02 02:25:31,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_01-model_00-model_states.pt. +32: [2022-12-02 02:25:31,085] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_25-model_00-model_states.pt... +32: [2022-12-02 02:25:31,085] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_25-model_01-model_states.pt... + 0: [2022-12-02 02:25:31,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_03-model_01-model_states.pt... + 0: [2022-12-02 02:25:31,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_03-model_00-model_states.pt... +32: [2022-12-02 02:25:31,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_25-model_01-model_states.pt. +32: [2022-12-02 02:25:31,336] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_26-model_01-model_states.pt... + 0: [2022-12-02 02:25:31,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_03-model_01-model_states.pt. + 0: [2022-12-02 02:25:31,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_03-model_00-model_states.pt. + 0: [2022-12-02 02:25:31,342] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_04-model_01-model_states.pt... + 0: [2022-12-02 02:25:31,342] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_04-model_00-model_states.pt... +32: [2022-12-02 02:25:31,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_25-model_00-model_states.pt. +32: [2022-12-02 02:25:31,344] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_26-model_00-model_states.pt... +32: [2022-12-02 02:25:31,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_26-model_01-model_states.pt. +32: [2022-12-02 02:25:31,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_27-model_01-model_states.pt... + 0: [2022-12-02 02:25:31,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_04-model_00-model_states.pt. + 0: [2022-12-02 02:25:31,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_04-model_01-model_states.pt. + 0: [2022-12-02 02:25:31,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_05-model_00-model_states.pt... + 0: [2022-12-02 02:25:31,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_05-model_01-model_states.pt... +32: [2022-12-02 02:25:31,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_26-model_00-model_states.pt. +32: [2022-12-02 02:25:31,623] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_27-model_00-model_states.pt... +32: [2022-12-02 02:25:31,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_27-model_00-model_states.pt. +32: [2022-12-02 02:25:31,867] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_28-model_00-model_states.pt... + 0: [2022-12-02 02:25:31,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_05-model_01-model_states.pt. + 0: [2022-12-02 02:25:31,877] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_06-model_01-model_states.pt... +32: [2022-12-02 02:25:31,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_27-model_01-model_states.pt. +32: [2022-12-02 02:25:31,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_28-model_01-model_states.pt... + 0: [2022-12-02 02:25:31,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_05-model_00-model_states.pt. + 0: [2022-12-02 02:25:31,887] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_06-model_00-model_states.pt... +32: [2022-12-02 02:25:32,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_28-model_00-model_states.pt. + 0: [2022-12-02 02:25:32,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_06-model_01-model_states.pt. + 0: [2022-12-02 02:25:32,136] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_07-model_01-model_states.pt... +32: [2022-12-02 02:25:32,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_29-model_00-model_states.pt... + 0: [2022-12-02 02:25:32,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_06-model_00-model_states.pt. + 0: [2022-12-02 02:25:32,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_07-model_00-model_states.pt... +32: [2022-12-02 02:25:32,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_28-model_01-model_states.pt. +32: [2022-12-02 02:25:32,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_29-model_01-model_states.pt... +32: [2022-12-02 02:25:32,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_29-model_00-model_states.pt. +32: [2022-12-02 02:25:32,391] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_30-model_00-model_states.pt... + 0: [2022-12-02 02:25:32,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_07-model_00-model_states.pt. + 0: [2022-12-02 02:25:32,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_08-model_00-model_states.pt... + 0: [2022-12-02 02:25:32,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_07-model_01-model_states.pt. + 0: [2022-12-02 02:25:32,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_08-model_01-model_states.pt... +32: [2022-12-02 02:25:32,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_29-model_01-model_states.pt. +32: [2022-12-02 02:25:32,407] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_30-model_01-model_states.pt... +32: [2022-12-02 02:25:32,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_30-model_00-model_states.pt. +32: [2022-12-02 02:25:32,652] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_31-model_00-model_states.pt... +32: [2022-12-02 02:25:32,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_30-model_01-model_states.pt. +32: [2022-12-02 02:25:32,658] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_31-model_01-model_states.pt... + 0: [2022-12-02 02:25:32,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_08-model_00-model_states.pt. + 0: [2022-12-02 02:25:32,668] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_09-model_00-model_states.pt... + 0: [2022-12-02 02:25:32,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_08-model_01-model_states.pt. + 0: [2022-12-02 02:25:32,668] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_09-model_01-model_states.pt... +32: [2022-12-02 02:25:32,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_31-model_00-model_states.pt. +32: [2022-12-02 02:25:32,914] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_32-model_00-model_states.pt... +32: [2022-12-02 02:25:32,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_31-model_01-model_states.pt. +32: [2022-12-02 02:25:32,922] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_32-model_01-model_states.pt... + 0: [2022-12-02 02:25:32,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_09-model_01-model_states.pt. + 0: [2022-12-02 02:25:32,936] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_10-model_01-model_states.pt... + 0: [2022-12-02 02:25:32,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_09-model_00-model_states.pt. + 0: [2022-12-02 02:25:32,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_10-model_00-model_states.pt... +32: [2022-12-02 02:25:33,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_32-model_00-model_states.pt. +32: [2022-12-02 02:25:33,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_33-model_00-model_states.pt... +32: [2022-12-02 02:25:33,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_32-model_01-model_states.pt. +32: [2022-12-02 02:25:33,202] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_33-model_01-model_states.pt... + 0: [2022-12-02 02:25:33,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_10-model_00-model_states.pt. + 0: [2022-12-02 02:25:33,210] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_11-model_00-model_states.pt... + 0: [2022-12-02 02:25:33,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_10-model_01-model_states.pt. + 0: [2022-12-02 02:25:33,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_11-model_01-model_states.pt... +32: [2022-12-02 02:25:33,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_33-model_00-model_states.pt. +32: [2022-12-02 02:25:33,457] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_34-model_00-model_states.pt... +32: [2022-12-02 02:25:33,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_33-model_01-model_states.pt. +32: [2022-12-02 02:25:33,460] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_34-model_01-model_states.pt... + 0: [2022-12-02 02:25:33,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_11-model_00-model_states.pt. + 0: [2022-12-02 02:25:33,474] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_12-model_00-model_states.pt... + 0: [2022-12-02 02:25:33,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_11-model_01-model_states.pt. + 0: [2022-12-02 02:25:33,475] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_12-model_01-model_states.pt... +32: [2022-12-02 02:25:33,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_34-model_00-model_states.pt. +32: [2022-12-02 02:25:33,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_34-model_01-model_states.pt. +32: [2022-12-02 02:25:33,723] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_35-model_00-model_states.pt... +32: [2022-12-02 02:25:33,723] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_35-model_01-model_states.pt... + 0: [2022-12-02 02:25:33,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_12-model_00-model_states.pt. + 0: [2022-12-02 02:25:33,728] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_13-model_00-model_states.pt... + 0: [2022-12-02 02:25:33,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_12-model_01-model_states.pt. + 0: [2022-12-02 02:25:33,740] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_13-model_01-model_states.pt... + 0: [2022-12-02 02:25:33,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_13-model_00-model_states.pt. + 0: [2022-12-02 02:25:33,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_14-model_00-model_states.pt... + 0: [2022-12-02 02:25:33,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_13-model_01-model_states.pt. + 0: [2022-12-02 02:25:33,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_14-model_01-model_states.pt... +32: [2022-12-02 02:25:33,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_35-model_00-model_states.pt. +32: [2022-12-02 02:25:33,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_36-model_00-model_states.pt... +32: [2022-12-02 02:25:33,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_35-model_01-model_states.pt. +32: [2022-12-02 02:25:33,991] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_36-model_01-model_states.pt... +32: [2022-12-02 02:25:34,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_36-model_00-model_states.pt. +32: [2022-12-02 02:25:34,218] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_37-model_00-model_states.pt... +32: [2022-12-02 02:25:34,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_36-model_01-model_states.pt. +32: [2022-12-02 02:25:34,243] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_37-model_01-model_states.pt... + 0: [2022-12-02 02:25:34,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_14-model_00-model_states.pt. + 0: [2022-12-02 02:25:34,244] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_15-model_00-model_states.pt... + 0: [2022-12-02 02:25:34,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_14-model_01-model_states.pt. + 0: [2022-12-02 02:25:34,252] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_15-model_01-model_states.pt... +32: [2022-12-02 02:25:34,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_37-model_00-model_states.pt. +32: [2022-12-02 02:25:34,484] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_38-model_00-model_states.pt... +32: [2022-12-02 02:25:34,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_37-model_01-model_states.pt. +32: [2022-12-02 02:25:34,497] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_38-model_01-model_states.pt... + 0: [2022-12-02 02:25:34,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_15-model_00-model_states.pt. + 0: [2022-12-02 02:25:34,510] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_16-model_00-model_states.pt... + 0: [2022-12-02 02:25:34,514] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_15-model_01-model_states.pt. + 0: [2022-12-02 02:25:34,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_16-model_01-model_states.pt... +32: [2022-12-02 02:25:34,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_38-model_01-model_states.pt. +32: [2022-12-02 02:25:34,751] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_39-model_01-model_states.pt... +32: [2022-12-02 02:25:34,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_38-model_00-model_states.pt. +32: [2022-12-02 02:25:34,762] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_39-model_00-model_states.pt... + 0: [2022-12-02 02:25:34,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_16-model_01-model_states.pt. + 0: [2022-12-02 02:25:34,768] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_17-model_01-model_states.pt... + 0: [2022-12-02 02:25:34,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_16-model_00-model_states.pt. + 0: [2022-12-02 02:25:34,782] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_17-model_00-model_states.pt... +32: [2022-12-02 02:25:35,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_39-model_00-model_states.pt. +32: [2022-12-02 02:25:35,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_40-model_00-model_states.pt... +32: [2022-12-02 02:25:35,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_39-model_01-model_states.pt. +32: [2022-12-02 02:25:35,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_40-model_01-model_states.pt... + 0: [2022-12-02 02:25:35,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_17-model_01-model_states.pt. + 0: [2022-12-02 02:25:35,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_18-model_01-model_states.pt... + 0: [2022-12-02 02:25:35,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_17-model_00-model_states.pt. + 0: [2022-12-02 02:25:35,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_18-model_00-model_states.pt... +32: [2022-12-02 02:25:35,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_40-model_00-model_states.pt. +32: [2022-12-02 02:25:35,286] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_41-model_00-model_states.pt... + 0: [2022-12-02 02:25:35,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_18-model_01-model_states.pt. + 0: [2022-12-02 02:25:35,292] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_19-model_01-model_states.pt... + 0: [2022-12-02 02:25:35,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_18-model_00-model_states.pt. + 0: [2022-12-02 02:25:35,292] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_19-model_00-model_states.pt... +32: [2022-12-02 02:25:35,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_40-model_01-model_states.pt. +32: [2022-12-02 02:25:35,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_41-model_01-model_states.pt... +32: [2022-12-02 02:25:35,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_41-model_00-model_states.pt. +32: [2022-12-02 02:25:35,535] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_42-model_00-model_states.pt... +32: [2022-12-02 02:25:35,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_41-model_01-model_states.pt. +32: [2022-12-02 02:25:35,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_42-model_01-model_states.pt... + 0: [2022-12-02 02:25:35,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_19-model_00-model_states.pt. + 0: [2022-12-02 02:25:35,550] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_20-model_00-model_states.pt... + 0: [2022-12-02 02:25:35,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_19-model_01-model_states.pt. + 0: [2022-12-02 02:25:35,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_20-model_01-model_states.pt... +32: [2022-12-02 02:25:35,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_42-model_00-model_states.pt. +32: [2022-12-02 02:25:35,788] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_43-model_00-model_states.pt... +32: [2022-12-02 02:25:35,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_42-model_01-model_states.pt. +32: [2022-12-02 02:25:35,797] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_43-model_01-model_states.pt... + 0: [2022-12-02 02:25:35,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_20-model_01-model_states.pt. + 0: [2022-12-02 02:25:35,801] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_21-model_01-model_states.pt... + 0: [2022-12-02 02:25:35,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_20-model_00-model_states.pt. + 0: [2022-12-02 02:25:35,820] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_21-model_00-model_states.pt... +32: [2022-12-02 02:25:36,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_43-model_00-model_states.pt. +32: [2022-12-02 02:25:36,047] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_44-model_00-model_states.pt... + 0: [2022-12-02 02:25:36,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_21-model_01-model_states.pt. + 0: [2022-12-02 02:25:36,060] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_22-model_01-model_states.pt... +32: [2022-12-02 02:25:36,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_43-model_01-model_states.pt. +32: [2022-12-02 02:25:36,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_44-model_01-model_states.pt... + 0: [2022-12-02 02:25:36,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_21-model_00-model_states.pt. + 0: [2022-12-02 02:25:36,067] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_22-model_00-model_states.pt... +32: [2022-12-02 02:25:36,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_44-model_01-model_states.pt. +32: [2022-12-02 02:25:36,304] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_46-model_01-model_states.pt... +32: [2022-12-02 02:25:36,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_44-model_00-model_states.pt. +32: [2022-12-02 02:25:36,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_46-model_00-model_states.pt... +32: [2022-12-02 02:25:36,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_46-model_01-model_states.pt. +32: [2022-12-02 02:25:36,315] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/mp_rank_03_model_states.pt... +32: [2022-12-02 02:25:36,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_46-model_00-model_states.pt. + 0: [2022-12-02 02:25:36,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_22-model_00-model_states.pt. +32: [2022-12-02 02:25:36,322] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/mp_rank_02_model_states.pt... + 0: [2022-12-02 02:25:36,322] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_23-model_00-model_states.pt... +32: [2022-12-02 02:25:36,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/mp_rank_03_model_states.pt. +32: [2022-12-02 02:25:36,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/mp_rank_02_model_states.pt. + 0: [2022-12-02 02:25:36,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_22-model_01-model_states.pt. + 0: [2022-12-02 02:25:36,332] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/layer_23-model_01-model_states.pt... + 0: [2022-12-02 02:25:36,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_23-model_00-model_states.pt. + 0: [2022-12-02 02:25:36,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/layer_23-model_01-model_states.pt. + 0: [2022-12-02 02:25:36,580] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: checkpoints_8b7beta/global_step5000/mp_rank_01_model_states.pt + 0: [2022-12-02 02:25:36,580] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_8b7beta/global_step5000/mp_rank_00_model_states.pt + 0: [2022-12-02 02:25:36,580] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/mp_rank_01_model_states.pt... + 0: [2022-12-02 02:25:36,580] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/mp_rank_00_model_states.pt... + 0: [2022-12-02 02:25:36,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/mp_rank_01_model_states.pt. + 0: [2022-12-02 02:25:36,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/mp_rank_00_model_states.pt. +32: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt... +63: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt... +63: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt... +63: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt... +63: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt... +32: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt... +32: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt... +48: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt... +48: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt... +48: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt... +48: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt... +54: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt... +54: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt... +54: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt... +54: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt... +56: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt... +56: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt... +56: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt... +56: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt... +62: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt... +62: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt... +62: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt... +62: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt... +60: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt... +60: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt... +60: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt... +51: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt... +51: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt... +51: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt... +51: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt... +39: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt... +39: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt... +39: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt... +39: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt... +57: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt... +57: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt... +57: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt... +57: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt... +43: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt... +43: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt... +38: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt... +38: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt... +38: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt... +38: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt... +36: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt... +36: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt... +36: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt... +36: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt... +42: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt... +42: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt... +42: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt... +42: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt... +58: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt... +58: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt... +58: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt... +58: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt... +44: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt... +44: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt... +44: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt... +44: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt... +34: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt... +34: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt... +34: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt... +34: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt... +32: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt... +50: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt... +50: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt... +50: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt... +50: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt... +46: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt... +46: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt... +46: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt... +46: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt... +49: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt... +49: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt... +49: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt... +49: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt... +41: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt... +41: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt... +41: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt... +41: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt... +55: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt... +55: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt... +55: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt... +55: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt... +37: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt... +37: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt... +37: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt... +37: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt... +33: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt... +33: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt... +33: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt... +33: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt... +52: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt... +52: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt... +52: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt... +52: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt... +61: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt... +61: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt... +61: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt... +61: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt... +45: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt... +45: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt... +45: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt... +45: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt... +47: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt... +47: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt... +47: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt... +59: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt... +59: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt... +59: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt... +59: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt... +35: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt... +35: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt... +35: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt... +35: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt... +48: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt... +53: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt... +53: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt... +53: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt... +53: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt... +40: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt... +40: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt... + 0: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +54: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt... +56: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt... +56: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt... +62: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt... +62: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt... +62: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt... + 4: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +60: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt... +51: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt... +51: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt... +51: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt... +51: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt... +63: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt... +63: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt... +63: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt... +63: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt... +39: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt... + 7: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... + 7: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +57: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt... +57: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt... +43: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt... +43: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt... +38: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt... +38: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt... +38: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt... +36: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt... +42: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt... +42: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt... +42: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt... +58: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt... +58: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt... +44: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt... +44: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt... +34: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt... +34: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt... +34: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt... +34: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt... +32: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt... +50: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt... +50: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt... +50: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt... +46: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt... +26: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... +49: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt... +41: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt... +41: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt... +41: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt... +41: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt... +55: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt... +37: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt... +21: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... +33: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt... +33: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt... +52: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt... +52: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt... +31: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... +61: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt... +61: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt... +45: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt... +45: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt... +47: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt... +47: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt... +59: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt... +35: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt... +48: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt... +53: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt... +53: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt... +40: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt... +40: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt... +40: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt... +16: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... + 0: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +54: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt... +56: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt... +62: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt... + 4: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... +60: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt... +60: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt... +60: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt... +60: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt... +17: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... +17: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... + 1: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +39: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt... +39: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt... + 7: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +11: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +11: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +57: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt... +57: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt... +27: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... +27: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... +43: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt... +43: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt... +25: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... +13: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +13: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +14: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +14: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +38: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt... +22: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... +22: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... +22: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... +36: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt... +24: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... +24: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... +42: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt... +58: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt... +44: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt... +44: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt... +28: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... +28: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... +28: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... +28: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... +28: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... +32: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt... +50: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt... +46: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt... +26: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... +26: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... +26: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... +49: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt... +49: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt... +49: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt... + 5: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... + 5: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... + 5: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... + 5: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +20: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... +20: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... +55: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt... +10: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +10: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... +10: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +10: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +37: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt... +37: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt... +37: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt... + 8: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... + 8: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +18: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... +18: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... +21: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... +33: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt... +52: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt... +52: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt... + 6: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... + 6: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... + 6: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +31: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... +31: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... +61: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt... +15: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +45: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt... +47: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt... +47: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt... + 3: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... + 9: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... + 9: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +59: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt... +12: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +12: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... + 2: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... + 2: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... + 2: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +29: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... +35: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt... +35: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt... +19: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... +48: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt... +53: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt... +53: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt... +40: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt... +40: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt... +40: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt... +30: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... +23: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... +23: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... +23: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... +16: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... +16: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... +16: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... + 0: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +54: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt... +54: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt... +56: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt... + 4: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +17: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... +17: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... + 1: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... + 1: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +39: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt... + 7: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... + 7: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... + 7: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +11: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... +27: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... +43: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt... +25: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... +25: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... +25: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... +13: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +14: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +14: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... +14: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +22: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... +22: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... +36: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt... +24: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... +24: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... +58: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt... +28: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... +32: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt... +32: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt... +46: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt... +46: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt... +26: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... +26: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... + 5: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... + 5: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... +20: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... +20: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... +55: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt... +10: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... +10: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... + 8: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... + 8: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +18: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... +21: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... +33: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt... + 6: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... + 6: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +31: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... +61: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt... +15: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +45: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt... +47: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt... + 3: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... + 9: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... + 9: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +59: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt... +59: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt... +12: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +12: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... + 2: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +29: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... +29: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... +29: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... +35: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt... +19: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... +48: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt... +30: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... +30: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... +30: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... +23: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... +23: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... +16: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... +16: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... + 0: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... + 4: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +17: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... +17: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... + 1: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... + 1: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... + 7: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... +11: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +27: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... +27: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... +27: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... +43: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt... +25: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... +13: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... +13: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... +14: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... +22: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... +36: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt... +24: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... +28: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... +26: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... + 5: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... +20: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... +20: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... +55: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt... +10: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... +10: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... + 8: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... +18: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... +21: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... + 6: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... +31: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... +31: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... +15: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +15: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +15: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... + 3: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... + 9: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... +12: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... + 2: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... + 2: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... +29: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... +29: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... +19: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... +30: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... +23: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... + 0: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... + 4: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... +17: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... + 1: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... + 7: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... +11: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... +27: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... +25: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... +13: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... +14: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... +22: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... +24: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... +28: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... +26: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... + 5: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... +20: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... + 8: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... + 8: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... +18: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... +18: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... +21: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... + 6: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... +31: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... +15: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... + 3: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... + 3: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... + 9: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... + 9: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... + 9: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... +12: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... + 2: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... +19: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... +19: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... +30: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... +23: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... +16: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... + 0: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... + 0: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... + 4: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... + 4: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... +17: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... + 1: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... +11: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... +27: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... +25: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... +13: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +14: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... +22: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... +24: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... +20: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... + 8: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... +18: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... +21: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... + 6: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... +31: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... +15: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... + 3: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... + 2: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... +29: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... +29: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... +19: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... +30: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... +23: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... +16: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... + 0: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... + 4: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... + 1: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... +11: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +25: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... +13: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... +24: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... +18: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... +21: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... +15: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... + 3: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... +12: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +11: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... +21: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... + 3: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... +12: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... +19: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... +30: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... +19: [2022-12-02 02:25:36,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... +32: [2022-12-02 02:25:37,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt. +32: [2022-12-02 02:25:37,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt +32: [2022-12-02 02:25:37,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +32: [2022-12-02 02:25:37,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt. +32: [2022-12-02 02:25:37,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt +32: [2022-12-02 02:25:37,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 0: [2022-12-02 02:25:37,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. + 0: [2022-12-02 02:25:37,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt + 0: [2022-12-02 02:25:37,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 0: [2022-12-02 02:25:37,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. + 0: [2022-12-02 02:25:37,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt + 0: [2022-12-02 02:25:37,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 0: [2022-12-02 02:25:37,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. + 0: [2022-12-02 02:25:37,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. + 0: [2022-12-02 02:25:37,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt + 0: [2022-12-02 02:25:37,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 0: [2022-12-02 02:25:37,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. + 0: [2022-12-02 02:25:37,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt + 0: [2022-12-02 02:25:37,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +32: [2022-12-02 02:25:37,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt. +32: [2022-12-02 02:25:37,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt +32: [2022-12-02 02:25:37,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +32: [2022-12-02 02:25:37,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt. +32: [2022-12-02 02:25:37,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt +32: [2022-12-02 02:25:37,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +32: [2022-12-02 02:25:37,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt. +32: [2022-12-02 02:25:37,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt +32: [2022-12-02 02:25:37,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 0: [2022-12-02 02:25:37,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. + 0: [2022-12-02 02:25:37,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt + 0: [2022-12-02 02:25:37,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 0: [2022-12-02 02:25:37,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. + 0: [2022-12-02 02:25:37,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt + 0: [2022-12-02 02:25:37,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 0: [2022-12-02 02:25:37,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. + 0: [2022-12-02 02:25:37,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt + 0: [2022-12-02 02:25:37,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +32: [2022-12-02 02:25:37,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt. +32: [2022-12-02 02:25:37,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt +32: [2022-12-02 02:25:37,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +32: [2022-12-02 02:25:37,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt. +32: [2022-12-02 02:25:37,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt +32: [2022-12-02 02:25:37,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +32: [2022-12-02 02:25:37,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt. +32: [2022-12-02 02:25:37,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt +32: [2022-12-02 02:25:37,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +39: [2022-12-02 02:25:37,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt. +38: [2022-12-02 02:25:37,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt. +38: [2022-12-02 02:25:37,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt. +38: [2022-12-02 02:25:37,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt +38: [2022-12-02 02:25:37,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt +38: [2022-12-02 02:25:37,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +38: [2022-12-02 02:25:37,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 3: [2022-12-02 02:25:37,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. + 3: [2022-12-02 02:25:37,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt + 3: [2022-12-02 02:25:37,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +13: [2022-12-02 02:25:37,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +13: [2022-12-02 02:25:37,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +13: [2022-12-02 02:25:37,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +38: [2022-12-02 02:25:37,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt. +38: [2022-12-02 02:25:37,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt +38: [2022-12-02 02:25:37,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +39: [2022-12-02 02:25:37,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt +39: [2022-12-02 02:25:37,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +39: [2022-12-02 02:25:37,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt. +39: [2022-12-02 02:25:37,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt +39: [2022-12-02 02:25:37,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +39: [2022-12-02 02:25:37,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt. +39: [2022-12-02 02:25:37,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt +39: [2022-12-02 02:25:37,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +39: [2022-12-02 02:25:37,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt. +39: [2022-12-02 02:25:37,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt +39: [2022-12-02 02:25:37,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +39: [2022-12-02 02:25:37,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt. +39: [2022-12-02 02:25:37,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt +39: [2022-12-02 02:25:37,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +39: [2022-12-02 02:25:37,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt. +39: [2022-12-02 02:25:37,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt. +39: [2022-12-02 02:25:37,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt +39: [2022-12-02 02:25:37,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt +39: [2022-12-02 02:25:37,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +39: [2022-12-02 02:25:37,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 9: [2022-12-02 02:25:37,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. + 9: [2022-12-02 02:25:37,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt + 9: [2022-12-02 02:25:37,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +13: [2022-12-02 02:25:37,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +13: [2022-12-02 02:25:37,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +13: [2022-12-02 02:25:37,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 3: [2022-12-02 02:25:37,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +51: [2022-12-02 02:25:37,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt. + 3: [2022-12-02 02:25:37,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt + 3: [2022-12-02 02:25:37,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +51: [2022-12-02 02:25:37,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt. +51: [2022-12-02 02:25:37,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt. +51: [2022-12-02 02:25:37,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt +51: [2022-12-02 02:25:37,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt +51: [2022-12-02 02:25:37,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +51: [2022-12-02 02:25:37,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt +51: [2022-12-02 02:25:37,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +51: [2022-12-02 02:25:37,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +13: [2022-12-02 02:25:37,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. +13: [2022-12-02 02:25:37,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt +13: [2022-12-02 02:25:37,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +57: [2022-12-02 02:25:37,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt. +57: [2022-12-02 02:25:37,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt. +57: [2022-12-02 02:25:37,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt. +57: [2022-12-02 02:25:37,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt +57: [2022-12-02 02:25:37,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt +57: [2022-12-02 02:25:37,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt +57: [2022-12-02 02:25:37,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +57: [2022-12-02 02:25:37,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +57: [2022-12-02 02:25:37,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 3: [2022-12-02 02:25:37,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. + 3: [2022-12-02 02:25:37,390] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt + 3: [2022-12-02 02:25:37,390] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 3: [2022-12-02 02:25:37,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. +51: [2022-12-02 02:25:37,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt. + 3: [2022-12-02 02:25:37,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt + 3: [2022-12-02 02:25:37,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +51: [2022-12-02 02:25:37,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt +51: [2022-12-02 02:25:37,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +56: [2022-12-02 02:25:37,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt. + 3: [2022-12-02 02:25:37,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. + 3: [2022-12-02 02:25:37,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt + 3: [2022-12-02 02:25:37,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +13: [2022-12-02 02:25:37,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. +13: [2022-12-02 02:25:37,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt +13: [2022-12-02 02:25:37,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +13: [2022-12-02 02:25:37,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +13: [2022-12-02 02:25:37,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +13: [2022-12-02 02:25:37,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +13: [2022-12-02 02:25:37,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +13: [2022-12-02 02:25:37,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +13: [2022-12-02 02:25:37,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +25: [2022-12-02 02:25:37,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. +25: [2022-12-02 02:25:37,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. +25: [2022-12-02 02:25:37,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. +25: [2022-12-02 02:25:37,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. +25: [2022-12-02 02:25:37,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt +25: [2022-12-02 02:25:37,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt +25: [2022-12-02 02:25:37,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +25: [2022-12-02 02:25:37,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt +25: [2022-12-02 02:25:37,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +25: [2022-12-02 02:25:37,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt +25: [2022-12-02 02:25:37,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +25: [2022-12-02 02:25:37,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. +25: [2022-12-02 02:25:37,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +25: [2022-12-02 02:25:37,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt +25: [2022-12-02 02:25:37,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. +25: [2022-12-02 02:25:37,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt +25: [2022-12-02 02:25:37,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +25: [2022-12-02 02:25:37,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +25: [2022-12-02 02:25:37,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. +25: [2022-12-02 02:25:37,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt +25: [2022-12-02 02:25:37,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +13: [2022-12-02 02:25:37,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. +13: [2022-12-02 02:25:37,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt +13: [2022-12-02 02:25:37,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 3: [2022-12-02 02:25:37,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +13: [2022-12-02 02:25:37,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. + 3: [2022-12-02 02:25:37,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt + 3: [2022-12-02 02:25:37,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +13: [2022-12-02 02:25:37,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt +13: [2022-12-02 02:25:37,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 0: [2022-12-02 02:25:37,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt + 0: [2022-12-02 02:25:37,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +15: [2022-12-02 02:25:37,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +15: [2022-12-02 02:25:37,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. +15: [2022-12-02 02:25:37,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. + 1: [2022-12-02 02:25:37,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. + 1: [2022-12-02 02:25:37,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. + 1: [2022-12-02 02:25:37,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. + 1: [2022-12-02 02:25:37,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. +38: [2022-12-02 02:25:37,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt. +38: [2022-12-02 02:25:37,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt +38: [2022-12-02 02:25:37,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +15: [2022-12-02 02:25:37,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt +15: [2022-12-02 02:25:37,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +15: [2022-12-02 02:25:37,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +15: [2022-12-02 02:25:37,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt +15: [2022-12-02 02:25:37,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +15: [2022-12-02 02:25:37,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. +15: [2022-12-02 02:25:37,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. +15: [2022-12-02 02:25:37,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +15: [2022-12-02 02:25:37,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt +15: [2022-12-02 02:25:37,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt +15: [2022-12-02 02:25:37,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +15: [2022-12-02 02:25:37,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +57: [2022-12-02 02:25:37,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt. +57: [2022-12-02 02:25:37,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt +57: [2022-12-02 02:25:37,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +57: [2022-12-02 02:25:37,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt. +57: [2022-12-02 02:25:37,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt +57: [2022-12-02 02:25:37,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +57: [2022-12-02 02:25:37,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt. +57: [2022-12-02 02:25:37,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt +57: [2022-12-02 02:25:37,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 1: [2022-12-02 02:25:37,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt + 1: [2022-12-02 02:25:37,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt + 1: [2022-12-02 02:25:37,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt + 1: [2022-12-02 02:25:37,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt + 1: [2022-12-02 02:25:37,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 1: [2022-12-02 02:25:37,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 1: [2022-12-02 02:25:37,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 1: [2022-12-02 02:25:37,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +19: [2022-12-02 02:25:37,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. +19: [2022-12-02 02:25:37,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. +19: [2022-12-02 02:25:37,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. +19: [2022-12-02 02:25:37,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt +19: [2022-12-02 02:25:37,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt +19: [2022-12-02 02:25:37,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt +19: [2022-12-02 02:25:37,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +19: [2022-12-02 02:25:37,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +19: [2022-12-02 02:25:37,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +11: [2022-12-02 02:25:37,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. +11: [2022-12-02 02:25:37,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt +11: [2022-12-02 02:25:37,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +11: [2022-12-02 02:25:37,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. +11: [2022-12-02 02:25:37,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt +11: [2022-12-02 02:25:37,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +11: [2022-12-02 02:25:37,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. +56: [2022-12-02 02:25:37,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt +56: [2022-12-02 02:25:37,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +56: [2022-12-02 02:25:37,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt. +56: [2022-12-02 02:25:37,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt +56: [2022-12-02 02:25:37,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +56: [2022-12-02 02:25:37,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt. +56: [2022-12-02 02:25:37,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt +56: [2022-12-02 02:25:37,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +56: [2022-12-02 02:25:37,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt. +56: [2022-12-02 02:25:37,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt +56: [2022-12-02 02:25:37,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +11: [2022-12-02 02:25:37,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt +11: [2022-12-02 02:25:37,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +15: [2022-12-02 02:25:37,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +56: [2022-12-02 02:25:37,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt. +56: [2022-12-02 02:25:37,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt. +56: [2022-12-02 02:25:37,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt. +59: [2022-12-02 02:25:37,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt. +56: [2022-12-02 02:25:37,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt +56: [2022-12-02 02:25:37,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt +56: [2022-12-02 02:25:37,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt +56: [2022-12-02 02:25:37,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +56: [2022-12-02 02:25:37,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +56: [2022-12-02 02:25:37,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +51: [2022-12-02 02:25:37,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt. +51: [2022-12-02 02:25:37,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt +19: [2022-12-02 02:25:37,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. +51: [2022-12-02 02:25:37,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +19: [2022-12-02 02:25:37,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt +19: [2022-12-02 02:25:37,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +15: [2022-12-02 02:25:37,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +15: [2022-12-02 02:25:37,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +19: [2022-12-02 02:25:37,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. +19: [2022-12-02 02:25:37,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt +19: [2022-12-02 02:25:37,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. +19: [2022-12-02 02:25:37,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +19: [2022-12-02 02:25:37,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt +19: [2022-12-02 02:25:37,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +51: [2022-12-02 02:25:37,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt. +51: [2022-12-02 02:25:37,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt +51: [2022-12-02 02:25:37,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +15: [2022-12-02 02:25:37,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +15: [2022-12-02 02:25:37,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +15: [2022-12-02 02:25:37,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +62: [2022-12-02 02:25:37,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt. +62: [2022-12-02 02:25:37,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt. +62: [2022-12-02 02:25:37,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt +62: [2022-12-02 02:25:37,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt +62: [2022-12-02 02:25:37,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt. +62: [2022-12-02 02:25:37,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +62: [2022-12-02 02:25:37,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +62: [2022-12-02 02:25:37,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt +62: [2022-12-02 02:25:37,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +59: [2022-12-02 02:25:37,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt +59: [2022-12-02 02:25:37,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt. +59: [2022-12-02 02:25:37,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +59: [2022-12-02 02:25:37,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt. +59: [2022-12-02 02:25:37,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt. +59: [2022-12-02 02:25:37,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt +59: [2022-12-02 02:25:37,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt. +59: [2022-12-02 02:25:37,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt. +59: [2022-12-02 02:25:37,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt. +59: [2022-12-02 02:25:37,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt +59: [2022-12-02 02:25:37,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt +59: [2022-12-02 02:25:37,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +59: [2022-12-02 02:25:37,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +59: [2022-12-02 02:25:37,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +59: [2022-12-02 02:25:37,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt +59: [2022-12-02 02:25:37,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt +59: [2022-12-02 02:25:37,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt +59: [2022-12-02 02:25:37,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +59: [2022-12-02 02:25:37,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +62: [2022-12-02 02:25:37,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt. +59: [2022-12-02 02:25:37,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +62: [2022-12-02 02:25:37,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt. +62: [2022-12-02 02:25:37,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt. +62: [2022-12-02 02:25:37,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt +62: [2022-12-02 02:25:37,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt +62: [2022-12-02 02:25:37,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +62: [2022-12-02 02:25:37,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt +62: [2022-12-02 02:25:37,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt. +62: [2022-12-02 02:25:37,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +62: [2022-12-02 02:25:37,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +62: [2022-12-02 02:25:37,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt +62: [2022-12-02 02:25:37,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +62: [2022-12-02 02:25:37,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt. +62: [2022-12-02 02:25:37,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt +62: [2022-12-02 02:25:37,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +38: [2022-12-02 02:25:37,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt. +38: [2022-12-02 02:25:37,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt +38: [2022-12-02 02:25:37,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +38: [2022-12-02 02:25:37,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt. +38: [2022-12-02 02:25:37,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt +38: [2022-12-02 02:25:37,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +24: [2022-12-02 02:25:37,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. +24: [2022-12-02 02:25:37,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. +24: [2022-12-02 02:25:37,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt +24: [2022-12-02 02:25:37,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. +24: [2022-12-02 02:25:37,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. +24: [2022-12-02 02:25:37,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +24: [2022-12-02 02:25:37,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt +24: [2022-12-02 02:25:37,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt +24: [2022-12-02 02:25:37,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt +24: [2022-12-02 02:25:37,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +24: [2022-12-02 02:25:37,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +24: [2022-12-02 02:25:37,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +11: [2022-12-02 02:25:37,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +11: [2022-12-02 02:25:37,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +11: [2022-12-02 02:25:37,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 9: [2022-12-02 02:25:37,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. + 9: [2022-12-02 02:25:37,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt + 9: [2022-12-02 02:25:37,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 9: [2022-12-02 02:25:37,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. + 9: [2022-12-02 02:25:37,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt + 9: [2022-12-02 02:25:37,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 9: [2022-12-02 02:25:37,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. + 9: [2022-12-02 02:25:37,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt + 9: [2022-12-02 02:25:37,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +31: [2022-12-02 02:25:37,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. +31: [2022-12-02 02:25:37,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt +31: [2022-12-02 02:25:37,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +31: [2022-12-02 02:25:37,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. +31: [2022-12-02 02:25:37,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt +31: [2022-12-02 02:25:37,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +31: [2022-12-02 02:25:37,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. +31: [2022-12-02 02:25:37,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt +31: [2022-12-02 02:25:37,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +54: [2022-12-02 02:25:37,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt. +54: [2022-12-02 02:25:37,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt. +54: [2022-12-02 02:25:37,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt. +54: [2022-12-02 02:25:37,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt. +54: [2022-12-02 02:25:37,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt +54: [2022-12-02 02:25:37,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt +54: [2022-12-02 02:25:37,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt +54: [2022-12-02 02:25:37,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +54: [2022-12-02 02:25:37,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt +54: [2022-12-02 02:25:37,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +54: [2022-12-02 02:25:37,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +54: [2022-12-02 02:25:37,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +61: [2022-12-02 02:25:37,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt. +31: [2022-12-02 02:25:37,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. +61: [2022-12-02 02:25:37,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt. +61: [2022-12-02 02:25:37,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt +61: [2022-12-02 02:25:37,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt. +61: [2022-12-02 02:25:37,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +61: [2022-12-02 02:25:37,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt +61: [2022-12-02 02:25:37,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt +61: [2022-12-02 02:25:37,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +61: [2022-12-02 02:25:37,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +31: [2022-12-02 02:25:37,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt +31: [2022-12-02 02:25:37,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. +31: [2022-12-02 02:25:37,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +31: [2022-12-02 02:25:37,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt +31: [2022-12-02 02:25:37,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +47: [2022-12-02 02:25:37,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt. +47: [2022-12-02 02:25:37,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt. +47: [2022-12-02 02:25:37,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt. +47: [2022-12-02 02:25:37,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt +47: [2022-12-02 02:25:37,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt +47: [2022-12-02 02:25:37,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt +47: [2022-12-02 02:25:37,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +47: [2022-12-02 02:25:37,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +47: [2022-12-02 02:25:37,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +61: [2022-12-02 02:25:37,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt. +47: [2022-12-02 02:25:37,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt. +47: [2022-12-02 02:25:37,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt. +47: [2022-12-02 02:25:37,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt +47: [2022-12-02 02:25:37,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt +47: [2022-12-02 02:25:37,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +47: [2022-12-02 02:25:37,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +46: [2022-12-02 02:25:37,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt. +46: [2022-12-02 02:25:37,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt +46: [2022-12-02 02:25:37,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +46: [2022-12-02 02:25:37,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt. +46: [2022-12-02 02:25:37,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt +46: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +61: [2022-12-02 02:25:37,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt. +61: [2022-12-02 02:25:37,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt. +61: [2022-12-02 02:25:37,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt +61: [2022-12-02 02:25:37,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt +61: [2022-12-02 02:25:37,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt +61: [2022-12-02 02:25:37,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +61: [2022-12-02 02:25:37,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +61: [2022-12-02 02:25:37,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +46: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt. +46: [2022-12-02 02:25:37,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt +46: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +46: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt. +46: [2022-12-02 02:25:37,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt +46: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 8: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. + 8: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. + 8: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. + 8: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. + 8: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. + 8: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +40: [2022-12-02 02:25:37,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt. +40: [2022-12-02 02:25:37,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt. +40: [2022-12-02 02:25:37,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt. +40: [2022-12-02 02:25:37,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt. +40: [2022-12-02 02:25:37,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt. + 8: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. + 8: [2022-12-02 02:25:37,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt + 8: [2022-12-02 02:25:37,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt + 8: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. + 8: [2022-12-02 02:25:37,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt + 8: [2022-12-02 02:25:37,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt + 8: [2022-12-02 02:25:37,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt + 8: [2022-12-02 02:25:37,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt + 8: [2022-12-02 02:25:37,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +40: [2022-12-02 02:25:37,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt. + 8: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 8: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 8: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 8: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 8: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +40: [2022-12-02 02:25:37,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt. +40: [2022-12-02 02:25:37,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt +40: [2022-12-02 02:25:37,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt +40: [2022-12-02 02:25:37,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt +40: [2022-12-02 02:25:37,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt. + 8: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 8: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +40: [2022-12-02 02:25:37,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt +40: [2022-12-02 02:25:37,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt + 8: [2022-12-02 02:25:37,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +40: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +40: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +40: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +40: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +40: [2022-12-02 02:25:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt +40: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +40: [2022-12-02 02:25:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt + 8: [2022-12-02 02:25:37,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +40: [2022-12-02 02:25:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt +40: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +40: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +40: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +16: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. +16: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. +16: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. +16: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. +16: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. +16: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. +16: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. +16: [2022-12-02 02:25:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt +16: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. +16: [2022-12-02 02:25:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt +16: [2022-12-02 02:25:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt +16: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +16: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +16: [2022-12-02 02:25:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt +16: [2022-12-02 02:25:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt +16: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +16: [2022-12-02 02:25:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt +16: [2022-12-02 02:25:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt +16: [2022-12-02 02:25:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt +16: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +16: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +16: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +16: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +16: [2022-12-02 02:25:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +54: [2022-12-02 02:25:37,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt. +54: [2022-12-02 02:25:37,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt +54: [2022-12-02 02:25:37,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +54: [2022-12-02 02:25:37,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt. +54: [2022-12-02 02:25:37,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt. +54: [2022-12-02 02:25:37,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt +54: [2022-12-02 02:25:37,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt +54: [2022-12-02 02:25:37,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +54: [2022-12-02 02:25:37,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +42: [2022-12-02 02:25:37,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt. +42: [2022-12-02 02:25:37,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt. +42: [2022-12-02 02:25:37,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt. +42: [2022-12-02 02:25:37,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt. +42: [2022-12-02 02:25:37,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt +42: [2022-12-02 02:25:37,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt +42: [2022-12-02 02:25:37,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt +42: [2022-12-02 02:25:37,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt +42: [2022-12-02 02:25:37,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +42: [2022-12-02 02:25:37,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +42: [2022-12-02 02:25:37,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +42: [2022-12-02 02:25:37,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +42: [2022-12-02 02:25:37,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt. +42: [2022-12-02 02:25:37,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt +42: [2022-12-02 02:25:37,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +24: [2022-12-02 02:25:37,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. +24: [2022-12-02 02:25:37,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt +12: [2022-12-02 02:25:37,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. +12: [2022-12-02 02:25:37,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +12: [2022-12-02 02:25:37,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +12: [2022-12-02 02:25:37,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. +12: [2022-12-02 02:25:37,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. +24: [2022-12-02 02:25:37,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 1: [2022-12-02 02:25:37,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. + 1: [2022-12-02 02:25:37,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt + 1: [2022-12-02 02:25:37,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +61: [2022-12-02 02:25:37,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt. +61: [2022-12-02 02:25:37,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt +61: [2022-12-02 02:25:37,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +12: [2022-12-02 02:25:37,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. +12: [2022-12-02 02:25:37,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +12: [2022-12-02 02:25:37,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt +12: [2022-12-02 02:25:37,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +12: [2022-12-02 02:25:37,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +12: [2022-12-02 02:25:37,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt +12: [2022-12-02 02:25:37,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt +12: [2022-12-02 02:25:37,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt +12: [2022-12-02 02:25:37,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +12: [2022-12-02 02:25:37,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +12: [2022-12-02 02:25:37,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +12: [2022-12-02 02:25:37,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +12: [2022-12-02 02:25:37,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +12: [2022-12-02 02:25:37,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +12: [2022-12-02 02:25:37,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +12: [2022-12-02 02:25:37,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +49: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt. +49: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt. +49: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt. +49: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt. +49: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt. +49: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt. +49: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt. +49: [2022-12-02 02:25:37,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt +49: [2022-12-02 02:25:37,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt +49: [2022-12-02 02:25:37,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt +49: [2022-12-02 02:25:37,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt +49: [2022-12-02 02:25:37,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt +49: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +49: [2022-12-02 02:25:37,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt +49: [2022-12-02 02:25:37,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt +49: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +49: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +49: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +49: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +49: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +49: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +61: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt. +21: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. +21: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. +21: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. +21: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. +21: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. +21: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. +61: [2022-12-02 02:25:37,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt +61: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +21: [2022-12-02 02:25:37,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt +21: [2022-12-02 02:25:37,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt +21: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. +21: [2022-12-02 02:25:37,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt +21: [2022-12-02 02:25:37,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt +21: [2022-12-02 02:25:37,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt +21: [2022-12-02 02:25:37,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt +21: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +21: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +21: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +21: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +21: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +21: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +21: [2022-12-02 02:25:37,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt +21: [2022-12-02 02:25:37,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +21: [2022-12-02 02:25:37,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. +21: [2022-12-02 02:25:37,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt +21: [2022-12-02 02:25:37,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +25: [2022-12-02 02:25:37,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. +25: [2022-12-02 02:25:37,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt +25: [2022-12-02 02:25:37,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +14: [2022-12-02 02:25:37,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +14: [2022-12-02 02:25:37,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. +14: [2022-12-02 02:25:37,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. +14: [2022-12-02 02:25:37,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +14: [2022-12-02 02:25:37,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +14: [2022-12-02 02:25:37,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. +36: [2022-12-02 02:25:37,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt. +36: [2022-12-02 02:25:37,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt. +36: [2022-12-02 02:25:37,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt. +36: [2022-12-02 02:25:37,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt. +36: [2022-12-02 02:25:37,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt. +36: [2022-12-02 02:25:37,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt +36: [2022-12-02 02:25:37,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt +36: [2022-12-02 02:25:37,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt +36: [2022-12-02 02:25:37,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt +36: [2022-12-02 02:25:37,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +36: [2022-12-02 02:25:37,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +36: [2022-12-02 02:25:37,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt. +36: [2022-12-02 02:25:37,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt +36: [2022-12-02 02:25:37,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt. +36: [2022-12-02 02:25:37,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +36: [2022-12-02 02:25:37,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +36: [2022-12-02 02:25:37,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt. +36: [2022-12-02 02:25:37,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt +36: [2022-12-02 02:25:37,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +36: [2022-12-02 02:25:37,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt +36: [2022-12-02 02:25:37,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt +36: [2022-12-02 02:25:37,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +36: [2022-12-02 02:25:37,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +36: [2022-12-02 02:25:37,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +27: [2022-12-02 02:25:37,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. +27: [2022-12-02 02:25:37,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. +27: [2022-12-02 02:25:37,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. +27: [2022-12-02 02:25:37,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. +27: [2022-12-02 02:25:37,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt +27: [2022-12-02 02:25:37,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. +27: [2022-12-02 02:25:37,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. +27: [2022-12-02 02:25:37,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt +52: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt. +52: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt. +52: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt. +52: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt. +52: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt. +52: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt. +27: [2022-12-02 02:25:37,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt +27: [2022-12-02 02:25:37,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt +52: [2022-12-02 02:25:37,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt +52: [2022-12-02 02:25:37,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt +27: [2022-12-02 02:25:37,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +52: [2022-12-02 02:25:37,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt +27: [2022-12-02 02:25:37,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +27: [2022-12-02 02:25:37,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt +27: [2022-12-02 02:25:37,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +27: [2022-12-02 02:25:37,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt +27: [2022-12-02 02:25:37,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +52: [2022-12-02 02:25:37,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt +52: [2022-12-02 02:25:37,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt +52: [2022-12-02 02:25:37,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt +27: [2022-12-02 02:25:37,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +27: [2022-12-02 02:25:37,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +52: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +52: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +52: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +52: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +52: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +52: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +48: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt. +48: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt. +48: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt. +48: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt. +48: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt. +48: [2022-12-02 02:25:37,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt +48: [2022-12-02 02:25:37,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt +48: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt. +48: [2022-12-02 02:25:37,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt +48: [2022-12-02 02:25:37,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt +43: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt. +43: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt. +48: [2022-12-02 02:25:37,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt +48: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +48: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +48: [2022-12-02 02:25:37,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt +48: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +48: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +48: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +48: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +43: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt. +43: [2022-12-02 02:25:37,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt +43: [2022-12-02 02:25:37,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt +43: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +43: [2022-12-02 02:25:37,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt +43: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +43: [2022-12-02 02:25:37,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +14: [2022-12-02 02:25:37,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. +14: [2022-12-02 02:25:37,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +14: [2022-12-02 02:25:37,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +14: [2022-12-02 02:25:37,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt +14: [2022-12-02 02:25:37,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt +14: [2022-12-02 02:25:37,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +14: [2022-12-02 02:25:37,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt +14: [2022-12-02 02:25:37,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt +14: [2022-12-02 02:25:37,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +14: [2022-12-02 02:25:37,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +14: [2022-12-02 02:25:37,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +14: [2022-12-02 02:25:37,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +14: [2022-12-02 02:25:37,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +14: [2022-12-02 02:25:37,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +14: [2022-12-02 02:25:37,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +57: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt. +57: [2022-12-02 02:25:37,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt +57: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +58: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt. +37: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt. +37: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt. +37: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt. +37: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt. +37: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt. +58: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt. +58: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt. +58: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt. +58: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt. +58: [2022-12-02 02:25:37,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt +58: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +58: [2022-12-02 02:25:37,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt +58: [2022-12-02 02:25:37,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt +58: [2022-12-02 02:25:37,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt +58: [2022-12-02 02:25:37,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt +58: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +58: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +58: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +58: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +24: [2022-12-02 02:25:37,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. +24: [2022-12-02 02:25:37,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt +24: [2022-12-02 02:25:37,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +37: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt. +37: [2022-12-02 02:25:37,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt. +37: [2022-12-02 02:25:37,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt +37: [2022-12-02 02:25:37,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt +37: [2022-12-02 02:25:37,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt +37: [2022-12-02 02:25:37,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt +37: [2022-12-02 02:25:37,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt +37: [2022-12-02 02:25:37,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +37: [2022-12-02 02:25:37,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +37: [2022-12-02 02:25:37,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt +37: [2022-12-02 02:25:37,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt +37: [2022-12-02 02:25:37,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +37: [2022-12-02 02:25:37,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +37: [2022-12-02 02:25:37,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +37: [2022-12-02 02:25:37,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +37: [2022-12-02 02:25:37,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +31: [2022-12-02 02:25:37,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. +31: [2022-12-02 02:25:37,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt +31: [2022-12-02 02:25:37,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +43: [2022-12-02 02:25:37,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt. + 1: [2022-12-02 02:25:37,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +43: [2022-12-02 02:25:37,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt. +43: [2022-12-02 02:25:37,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt +43: [2022-12-02 02:25:37,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +43: [2022-12-02 02:25:37,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt +43: [2022-12-02 02:25:37,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +43: [2022-12-02 02:25:37,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt. +43: [2022-12-02 02:25:37,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt +43: [2022-12-02 02:25:37,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +26: [2022-12-02 02:25:37,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. +26: [2022-12-02 02:25:37,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. +26: [2022-12-02 02:25:37,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. +26: [2022-12-02 02:25:37,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. +26: [2022-12-02 02:25:37,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. +26: [2022-12-02 02:25:37,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. +27: [2022-12-02 02:25:37,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. +27: [2022-12-02 02:25:37,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. +27: [2022-12-02 02:25:37,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt +27: [2022-12-02 02:25:37,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt +27: [2022-12-02 02:25:37,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +27: [2022-12-02 02:25:37,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 1: [2022-12-02 02:25:37,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt + 1: [2022-12-02 02:25:37,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 1: [2022-12-02 02:25:37,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. + 1: [2022-12-02 02:25:37,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt + 1: [2022-12-02 02:25:37,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +26: [2022-12-02 02:25:37,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. +26: [2022-12-02 02:25:37,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt +26: [2022-12-02 02:25:37,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt +26: [2022-12-02 02:25:37,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt +26: [2022-12-02 02:25:37,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt +26: [2022-12-02 02:25:37,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt +26: [2022-12-02 02:25:37,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt +26: [2022-12-02 02:25:37,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +26: [2022-12-02 02:25:37,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +26: [2022-12-02 02:25:37,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +26: [2022-12-02 02:25:37,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt +26: [2022-12-02 02:25:37,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +26: [2022-12-02 02:25:37,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +26: [2022-12-02 02:25:37,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +26: [2022-12-02 02:25:37,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +26: [2022-12-02 02:25:37,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. +26: [2022-12-02 02:25:37,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt +26: [2022-12-02 02:25:37,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +11: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. + 9: [2022-12-02 02:25:37,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. +11: [2022-12-02 02:25:37,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +11: [2022-12-02 02:25:37,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +45: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt. +45: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt. +45: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt. +45: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt. +45: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt. +39: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt. +39: [2022-12-02 02:25:37,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt +45: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt. + 9: [2022-12-02 02:25:37,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt +39: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +45: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt. + 9: [2022-12-02 02:25:37,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +45: [2022-12-02 02:25:37,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt +45: [2022-12-02 02:25:37,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt +45: [2022-12-02 02:25:37,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt +45: [2022-12-02 02:25:37,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt +45: [2022-12-02 02:25:37,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt +45: [2022-12-02 02:25:37,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt +45: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +45: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +45: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +45: [2022-12-02 02:25:37,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt +45: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +45: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +45: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +45: [2022-12-02 02:25:37,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 5: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. + 5: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. + 5: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. + 5: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +55: [2022-12-02 02:25:37,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt. +55: [2022-12-02 02:25:37,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt. +55: [2022-12-02 02:25:37,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt. +55: [2022-12-02 02:25:37,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt +55: [2022-12-02 02:25:37,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt +55: [2022-12-02 02:25:37,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt. +55: [2022-12-02 02:25:37,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +55: [2022-12-02 02:25:37,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt +55: [2022-12-02 02:25:37,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +55: [2022-12-02 02:25:37,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +55: [2022-12-02 02:25:37,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt +55: [2022-12-02 02:25:37,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +55: [2022-12-02 02:25:37,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt. +55: [2022-12-02 02:25:37,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt +55: [2022-12-02 02:25:37,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 5: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. + 5: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. + 5: [2022-12-02 02:25:37,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. + 5: [2022-12-02 02:25:37,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt + 5: [2022-12-02 02:25:37,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt + 5: [2022-12-02 02:25:37,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt + 5: [2022-12-02 02:25:37,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt + 5: [2022-12-02 02:25:37,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 5: [2022-12-02 02:25:37,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 5: [2022-12-02 02:25:37,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt + 5: [2022-12-02 02:25:37,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 5: [2022-12-02 02:25:37,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 5: [2022-12-02 02:25:37,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt + 5: [2022-12-02 02:25:37,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt + 5: [2022-12-02 02:25:37,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 5: [2022-12-02 02:25:37,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 5: [2022-12-02 02:25:37,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +55: [2022-12-02 02:25:37,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt. +55: [2022-12-02 02:25:37,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt +55: [2022-12-02 02:25:37,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +55: [2022-12-02 02:25:37,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt. +55: [2022-12-02 02:25:37,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt +55: [2022-12-02 02:25:37,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +22: [2022-12-02 02:25:37,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. +22: [2022-12-02 02:25:37,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. +22: [2022-12-02 02:25:37,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. +22: [2022-12-02 02:25:37,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt +22: [2022-12-02 02:25:37,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt +22: [2022-12-02 02:25:37,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt +22: [2022-12-02 02:25:37,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +22: [2022-12-02 02:25:37,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +22: [2022-12-02 02:25:37,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +55: [2022-12-02 02:25:37,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt. +22: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. +22: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. +22: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. +22: [2022-12-02 02:25:37,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt +22: [2022-12-02 02:25:37,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt +22: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. +22: [2022-12-02 02:25:37,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt +22: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +22: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +22: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +22: [2022-12-02 02:25:37,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt +22: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +55: [2022-12-02 02:25:37,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt +55: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +33: [2022-12-02 02:25:37,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt. +33: [2022-12-02 02:25:37,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt. +33: [2022-12-02 02:25:37,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt. +33: [2022-12-02 02:25:37,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt. +33: [2022-12-02 02:25:37,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt +33: [2022-12-02 02:25:37,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt +33: [2022-12-02 02:25:37,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt +33: [2022-12-02 02:25:37,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt. +33: [2022-12-02 02:25:37,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt +33: [2022-12-02 02:25:37,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +33: [2022-12-02 02:25:37,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +33: [2022-12-02 02:25:37,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +33: [2022-12-02 02:25:37,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt +33: [2022-12-02 02:25:37,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +33: [2022-12-02 02:25:37,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +30: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. +30: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. +30: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. +30: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. +30: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. +30: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. +30: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. +30: [2022-12-02 02:25:37,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt +30: [2022-12-02 02:25:37,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt +30: [2022-12-02 02:25:37,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt +30: [2022-12-02 02:25:37,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt +30: [2022-12-02 02:25:37,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt +30: [2022-12-02 02:25:37,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt +30: [2022-12-02 02:25:37,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt +30: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +30: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +30: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +30: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +30: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +30: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +30: [2022-12-02 02:25:37,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +30: [2022-12-02 02:25:37,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. +30: [2022-12-02 02:25:37,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt +30: [2022-12-02 02:25:37,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +44: [2022-12-02 02:25:37,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt. +41: [2022-12-02 02:25:37,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt. +41: [2022-12-02 02:25:37,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt. +41: [2022-12-02 02:25:37,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt. +41: [2022-12-02 02:25:37,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt. +41: [2022-12-02 02:25:37,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt. +41: [2022-12-02 02:25:37,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt. +41: [2022-12-02 02:25:37,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt. +41: [2022-12-02 02:25:37,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt +41: [2022-12-02 02:25:37,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt +41: [2022-12-02 02:25:37,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt +41: [2022-12-02 02:25:37,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt +41: [2022-12-02 02:25:37,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt +41: [2022-12-02 02:25:37,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt +41: [2022-12-02 02:25:37,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt +41: [2022-12-02 02:25:37,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +41: [2022-12-02 02:25:37,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +41: [2022-12-02 02:25:37,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +41: [2022-12-02 02:25:37,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +41: [2022-12-02 02:25:37,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +41: [2022-12-02 02:25:37,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +41: [2022-12-02 02:25:37,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +41: [2022-12-02 02:25:37,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt. +41: [2022-12-02 02:25:37,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt +41: [2022-12-02 02:25:37,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +44: [2022-12-02 02:25:37,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt +44: [2022-12-02 02:25:37,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +44: [2022-12-02 02:25:37,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt. +44: [2022-12-02 02:25:37,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt +44: [2022-12-02 02:25:37,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +44: [2022-12-02 02:25:37,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt. +44: [2022-12-02 02:25:37,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt +44: [2022-12-02 02:25:37,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +44: [2022-12-02 02:25:37,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt. +44: [2022-12-02 02:25:37,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt. +44: [2022-12-02 02:25:37,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt. +44: [2022-12-02 02:25:37,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt. +44: [2022-12-02 02:25:37,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt +44: [2022-12-02 02:25:37,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt +44: [2022-12-02 02:25:37,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt +44: [2022-12-02 02:25:37,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt +44: [2022-12-02 02:25:37,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +44: [2022-12-02 02:25:37,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +44: [2022-12-02 02:25:37,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +44: [2022-12-02 02:25:37,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +44: [2022-12-02 02:25:37,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt. +60: [2022-12-02 02:25:37,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt. +60: [2022-12-02 02:25:37,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt. +60: [2022-12-02 02:25:37,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt. +60: [2022-12-02 02:25:37,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt +60: [2022-12-02 02:25:37,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +60: [2022-12-02 02:25:37,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt +60: [2022-12-02 02:25:37,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt +60: [2022-12-02 02:25:37,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +60: [2022-12-02 02:25:37,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +60: [2022-12-02 02:25:37,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt. +60: [2022-12-02 02:25:37,548] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt +60: [2022-12-02 02:25:37,548] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +44: [2022-12-02 02:25:37,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt +44: [2022-12-02 02:25:37,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 9: [2022-12-02 02:25:37,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. + 9: [2022-12-02 02:25:37,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt + 9: [2022-12-02 02:25:37,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +47: [2022-12-02 02:25:37,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt. +47: [2022-12-02 02:25:37,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt +47: [2022-12-02 02:25:37,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +47: [2022-12-02 02:25:37,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt. +11: [2022-12-02 02:25:37,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. +11: [2022-12-02 02:25:37,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt +47: [2022-12-02 02:25:37,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt +47: [2022-12-02 02:25:37,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +11: [2022-12-02 02:25:37,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +34: [2022-12-02 02:25:37,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt. +34: [2022-12-02 02:25:37,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt. +34: [2022-12-02 02:25:37,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt. +34: [2022-12-02 02:25:37,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt. +34: [2022-12-02 02:25:37,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt. +34: [2022-12-02 02:25:37,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt. +34: [2022-12-02 02:25:37,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt +34: [2022-12-02 02:25:37,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt +34: [2022-12-02 02:25:37,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt +34: [2022-12-02 02:25:37,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt +34: [2022-12-02 02:25:37,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt +34: [2022-12-02 02:25:37,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt +34: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +34: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +34: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +34: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +34: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +34: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt. +34: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +34: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt. +34: [2022-12-02 02:25:37,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt +34: [2022-12-02 02:25:37,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt +34: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +34: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +60: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt. +45: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt. +60: [2022-12-02 02:25:37,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt +60: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +45: [2022-12-02 02:25:37,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt +45: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +60: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt. +60: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt. +60: [2022-12-02 02:25:37,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt +60: [2022-12-02 02:25:37,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt. +60: [2022-12-02 02:25:37,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +60: [2022-12-02 02:25:37,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt +60: [2022-12-02 02:25:37,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt +60: [2022-12-02 02:25:37,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +60: [2022-12-02 02:25:37,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +63: [2022-12-02 02:25:37,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt. +63: [2022-12-02 02:25:37,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt. +63: [2022-12-02 02:25:37,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt. +63: [2022-12-02 02:25:37,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt. +63: [2022-12-02 02:25:37,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt. + 9: [2022-12-02 02:25:37,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. + 9: [2022-12-02 02:25:37,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt + 9: [2022-12-02 02:25:37,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +63: [2022-12-02 02:25:37,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt. +63: [2022-12-02 02:25:37,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt. +63: [2022-12-02 02:25:37,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt +63: [2022-12-02 02:25:37,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt +63: [2022-12-02 02:25:37,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt +63: [2022-12-02 02:25:37,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt +63: [2022-12-02 02:25:37,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt +63: [2022-12-02 02:25:37,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +63: [2022-12-02 02:25:37,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +63: [2022-12-02 02:25:37,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt +63: [2022-12-02 02:25:37,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt +63: [2022-12-02 02:25:37,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +63: [2022-12-02 02:25:37,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +63: [2022-12-02 02:25:37,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +63: [2022-12-02 02:25:37,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +63: [2022-12-02 02:25:37,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +10: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +10: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +10: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. +10: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +10: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. +10: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. +10: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. +10: [2022-12-02 02:25:37,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +10: [2022-12-02 02:25:37,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +10: [2022-12-02 02:25:37,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +10: [2022-12-02 02:25:37,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt +10: [2022-12-02 02:25:37,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt +10: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +10: [2022-12-02 02:25:37,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt +10: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +10: [2022-12-02 02:25:37,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt +10: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +10: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +10: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +10: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +10: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +17: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. +17: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. +17: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. +17: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. +17: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. +17: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. +17: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. +17: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. +17: [2022-12-02 02:25:37,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt +17: [2022-12-02 02:25:37,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt +17: [2022-12-02 02:25:37,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt +17: [2022-12-02 02:25:37,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt +17: [2022-12-02 02:25:37,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt +17: [2022-12-02 02:25:37,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt +17: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +17: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +17: [2022-12-02 02:25:37,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt +17: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +17: [2022-12-02 02:25:37,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt +17: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +17: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +17: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +17: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +17: [2022-12-02 02:25:37,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 6: [2022-12-02 02:25:37,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. + 6: [2022-12-02 02:25:37,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. + 6: [2022-12-02 02:25:37,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. + 6: [2022-12-02 02:25:37,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. + 6: [2022-12-02 02:25:37,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. + 4: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. + 4: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. + 4: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. + 4: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. + 4: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. + 4: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. + 4: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. + 4: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. + 4: [2022-12-02 02:25:37,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt + 4: [2022-12-02 02:25:37,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt + 4: [2022-12-02 02:25:37,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt + 4: [2022-12-02 02:25:37,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt + 4: [2022-12-02 02:25:37,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt + 4: [2022-12-02 02:25:37,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 4: [2022-12-02 02:25:37,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt + 4: [2022-12-02 02:25:37,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt + 4: [2022-12-02 02:25:37,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 4: [2022-12-02 02:25:37,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 4: [2022-12-02 02:25:37,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt + 4: [2022-12-02 02:25:37,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 4: [2022-12-02 02:25:37,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 4: [2022-12-02 02:25:37,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 4: [2022-12-02 02:25:37,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 4: [2022-12-02 02:25:37,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +35: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt. +35: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt. +35: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt. +35: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt. +35: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt. +35: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt. + 6: [2022-12-02 02:25:37,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. + 6: [2022-12-02 02:25:37,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. + 6: [2022-12-02 02:25:37,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. + 6: [2022-12-02 02:25:37,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt + 6: [2022-12-02 02:25:37,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt + 6: [2022-12-02 02:25:37,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt + 6: [2022-12-02 02:25:37,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt + 6: [2022-12-02 02:25:37,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt + 6: [2022-12-02 02:25:37,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt + 6: [2022-12-02 02:25:37,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt + 6: [2022-12-02 02:25:37,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt + 6: [2022-12-02 02:25:37,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 6: [2022-12-02 02:25:37,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 6: [2022-12-02 02:25:37,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 6: [2022-12-02 02:25:37,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 6: [2022-12-02 02:25:37,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 6: [2022-12-02 02:25:37,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 6: [2022-12-02 02:25:37,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 6: [2022-12-02 02:25:37,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +23: [2022-12-02 02:25:37,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. +23: [2022-12-02 02:25:37,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. +23: [2022-12-02 02:25:37,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. +23: [2022-12-02 02:25:37,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. +23: [2022-12-02 02:25:37,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. +23: [2022-12-02 02:25:37,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. +23: [2022-12-02 02:25:37,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. +23: [2022-12-02 02:25:37,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. +23: [2022-12-02 02:25:37,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt +23: [2022-12-02 02:25:37,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt +23: [2022-12-02 02:25:37,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt +23: [2022-12-02 02:25:37,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt +23: [2022-12-02 02:25:37,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt +23: [2022-12-02 02:25:37,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt +23: [2022-12-02 02:25:37,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt +23: [2022-12-02 02:25:37,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +23: [2022-12-02 02:25:37,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +23: [2022-12-02 02:25:37,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +23: [2022-12-02 02:25:37,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +23: [2022-12-02 02:25:37,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt +23: [2022-12-02 02:25:37,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +23: [2022-12-02 02:25:37,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +23: [2022-12-02 02:25:37,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +23: [2022-12-02 02:25:37,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +35: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt. +35: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt. +35: [2022-12-02 02:25:37,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt +35: [2022-12-02 02:25:37,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt +35: [2022-12-02 02:25:37,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt +35: [2022-12-02 02:25:37,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt +35: [2022-12-02 02:25:37,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt +35: [2022-12-02 02:25:37,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt +35: [2022-12-02 02:25:37,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt +35: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +35: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +35: [2022-12-02 02:25:37,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt +35: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +35: [2022-12-02 02:25:37,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +35: [2022-12-02 02:25:37,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +35: [2022-12-02 02:25:37,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +35: [2022-12-02 02:25:37,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +35: [2022-12-02 02:25:37,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +33: [2022-12-02 02:25:37,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt. +33: [2022-12-02 02:25:37,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt +33: [2022-12-02 02:25:37,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +33: [2022-12-02 02:25:37,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt. +33: [2022-12-02 02:25:37,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt +33: [2022-12-02 02:25:37,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +50: [2022-12-02 02:25:37,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt. +50: [2022-12-02 02:25:37,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt. +50: [2022-12-02 02:25:37,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt. +50: [2022-12-02 02:25:37,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt. +50: [2022-12-02 02:25:37,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt. +50: [2022-12-02 02:25:37,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt +50: [2022-12-02 02:25:37,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt. +50: [2022-12-02 02:25:37,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt +50: [2022-12-02 02:25:37,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt +50: [2022-12-02 02:25:37,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +50: [2022-12-02 02:25:37,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt +50: [2022-12-02 02:25:37,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt +50: [2022-12-02 02:25:37,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +50: [2022-12-02 02:25:37,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt +50: [2022-12-02 02:25:37,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +50: [2022-12-02 02:25:37,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +50: [2022-12-02 02:25:37,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +50: [2022-12-02 02:25:37,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 7: [2022-12-02 02:25:37,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. + 7: [2022-12-02 02:25:37,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. + 7: [2022-12-02 02:25:37,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. + 7: [2022-12-02 02:25:37,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. + 7: [2022-12-02 02:25:37,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. + 7: [2022-12-02 02:25:37,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt + 7: [2022-12-02 02:25:37,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt + 7: [2022-12-02 02:25:37,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt + 7: [2022-12-02 02:25:37,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt + 7: [2022-12-02 02:25:37,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 7: [2022-12-02 02:25:37,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 7: [2022-12-02 02:25:37,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt + 7: [2022-12-02 02:25:37,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 7: [2022-12-02 02:25:37,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 7: [2022-12-02 02:25:37,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 3: [2022-12-02 02:25:37,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. + 3: [2022-12-02 02:25:37,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt + 3: [2022-12-02 02:25:37,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +57: [2022-12-02 02:25:37,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt. + 7: [2022-12-02 02:25:37,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. +57: [2022-12-02 02:25:37,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt +57: [2022-12-02 02:25:37,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 7: [2022-12-02 02:25:37,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt + 7: [2022-12-02 02:25:37,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +18: [2022-12-02 02:25:37,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. +18: [2022-12-02 02:25:37,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. +18: [2022-12-02 02:25:37,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. +18: [2022-12-02 02:25:37,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. +18: [2022-12-02 02:25:37,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. +18: [2022-12-02 02:25:37,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. +18: [2022-12-02 02:25:37,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. +18: [2022-12-02 02:25:37,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt +18: [2022-12-02 02:25:37,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt +18: [2022-12-02 02:25:37,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt +18: [2022-12-02 02:25:37,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt +18: [2022-12-02 02:25:37,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +18: [2022-12-02 02:25:37,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt +18: [2022-12-02 02:25:37,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +18: [2022-12-02 02:25:37,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt +18: [2022-12-02 02:25:37,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt +18: [2022-12-02 02:25:37,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +18: [2022-12-02 02:25:37,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +18: [2022-12-02 02:25:37,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +18: [2022-12-02 02:25:37,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +18: [2022-12-02 02:25:37,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +50: [2022-12-02 02:25:37,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt. +50: [2022-12-02 02:25:37,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt +50: [2022-12-02 02:25:37,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +50: [2022-12-02 02:25:37,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt. +50: [2022-12-02 02:25:37,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt +50: [2022-12-02 02:25:37,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 7: [2022-12-02 02:25:37,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. + 7: [2022-12-02 02:25:37,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt + 7: [2022-12-02 02:25:37,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +31: [2022-12-02 02:25:37,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. +31: [2022-12-02 02:25:37,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt +31: [2022-12-02 02:25:37,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +42: [2022-12-02 02:25:37,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt. +42: [2022-12-02 02:25:37,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt +42: [2022-12-02 02:25:37,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +28: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. +28: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. +28: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. +28: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. +28: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. +28: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. +28: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. +28: [2022-12-02 02:25:37,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt +28: [2022-12-02 02:25:37,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt +28: [2022-12-02 02:25:37,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt +28: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. +28: [2022-12-02 02:25:37,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt +28: [2022-12-02 02:25:37,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt +28: [2022-12-02 02:25:37,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt +28: [2022-12-02 02:25:37,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt +28: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +28: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +28: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +28: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +28: [2022-12-02 02:25:37,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt +28: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +28: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +28: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +28: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +42: [2022-12-02 02:25:37,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt. +42: [2022-12-02 02:25:37,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt +42: [2022-12-02 02:25:37,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +19: [2022-12-02 02:25:37,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. +19: [2022-12-02 02:25:37,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt +19: [2022-12-02 02:25:37,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +58: [2022-12-02 02:25:37,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt. +58: [2022-12-02 02:25:37,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt +58: [2022-12-02 02:25:37,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +43: [2022-12-02 02:25:37,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt. +43: [2022-12-02 02:25:37,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt +43: [2022-12-02 02:25:37,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +19: [2022-12-02 02:25:37,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. +19: [2022-12-02 02:25:37,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt +19: [2022-12-02 02:25:37,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +20: [2022-12-02 02:25:37,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. +20: [2022-12-02 02:25:37,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. +20: [2022-12-02 02:25:37,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt +20: [2022-12-02 02:25:37,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt +20: [2022-12-02 02:25:37,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. +20: [2022-12-02 02:25:37,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +20: [2022-12-02 02:25:37,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +20: [2022-12-02 02:25:37,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt +20: [2022-12-02 02:25:37,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +20: [2022-12-02 02:25:37,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. +20: [2022-12-02 02:25:37,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. +20: [2022-12-02 02:25:37,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. +20: [2022-12-02 02:25:37,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. +20: [2022-12-02 02:25:37,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt +20: [2022-12-02 02:25:37,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt +20: [2022-12-02 02:25:37,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +20: [2022-12-02 02:25:37,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt +20: [2022-12-02 02:25:37,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +20: [2022-12-02 02:25:37,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt +20: [2022-12-02 02:25:37,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +20: [2022-12-02 02:25:37,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +58: [2022-12-02 02:25:37,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt. +58: [2022-12-02 02:25:37,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt. +58: [2022-12-02 02:25:37,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt +58: [2022-12-02 02:25:37,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt +58: [2022-12-02 02:25:37,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +58: [2022-12-02 02:25:37,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +11: [2022-12-02 02:25:37,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +11: [2022-12-02 02:25:37,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +11: [2022-12-02 02:25:37,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +38: [2022-12-02 02:25:37,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt. +38: [2022-12-02 02:25:37,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt +38: [2022-12-02 02:25:37,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +29: [2022-12-02 02:25:37,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. +29: [2022-12-02 02:25:37,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. +29: [2022-12-02 02:25:37,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt +29: [2022-12-02 02:25:37,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt +29: [2022-12-02 02:25:37,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +29: [2022-12-02 02:25:37,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +29: [2022-12-02 02:25:37,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. +29: [2022-12-02 02:25:37,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. +29: [2022-12-02 02:25:37,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. +29: [2022-12-02 02:25:37,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt +29: [2022-12-02 02:25:37,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt +29: [2022-12-02 02:25:37,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt +29: [2022-12-02 02:25:37,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. +29: [2022-12-02 02:25:37,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +29: [2022-12-02 02:25:37,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +29: [2022-12-02 02:25:37,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +29: [2022-12-02 02:25:37,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. +29: [2022-12-02 02:25:37,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt +29: [2022-12-02 02:25:37,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. +29: [2022-12-02 02:25:37,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +29: [2022-12-02 02:25:37,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt +29: [2022-12-02 02:25:37,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt +29: [2022-12-02 02:25:37,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +29: [2022-12-02 02:25:37,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +15: [2022-12-02 02:25:37,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +15: [2022-12-02 02:25:37,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +15: [2022-12-02 02:25:37,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 2: [2022-12-02 02:25:37,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. + 2: [2022-12-02 02:25:37,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. + 2: [2022-12-02 02:25:37,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. + 2: [2022-12-02 02:25:37,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. + 2: [2022-12-02 02:25:37,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. + 2: [2022-12-02 02:25:37,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. + 2: [2022-12-02 02:25:37,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt + 2: [2022-12-02 02:25:37,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. + 2: [2022-12-02 02:25:37,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. + 2: [2022-12-02 02:25:37,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt + 2: [2022-12-02 02:25:37,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt + 2: [2022-12-02 02:25:37,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt + 2: [2022-12-02 02:25:37,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt + 2: [2022-12-02 02:25:37,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt + 2: [2022-12-02 02:25:37,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 2: [2022-12-02 02:25:37,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 2: [2022-12-02 02:25:37,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt + 2: [2022-12-02 02:25:37,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt + 2: [2022-12-02 02:25:37,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 2: [2022-12-02 02:25:37,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 2: [2022-12-02 02:25:37,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 2: [2022-12-02 02:25:37,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 2: [2022-12-02 02:25:37,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 2: [2022-12-02 02:25:37,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +56: [2022-12-02 02:25:37,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt. +56: [2022-12-02 02:25:37,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt +56: [2022-12-02 02:25:37,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +48: [2022-12-02 02:25:37,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt. +48: [2022-12-02 02:25:37,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt +48: [2022-12-02 02:25:37,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +46: [2022-12-02 02:25:37,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt. +46: [2022-12-02 02:25:37,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt +46: [2022-12-02 02:25:37,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +46: [2022-12-02 02:25:37,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt. +46: [2022-12-02 02:25:37,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt +46: [2022-12-02 02:25:37,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +46: [2022-12-02 02:25:37,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt. +46: [2022-12-02 02:25:37,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt +46: [2022-12-02 02:25:37,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +20: [2022-12-02 02:25:37,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. +20: [2022-12-02 02:25:37,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt +20: [2022-12-02 02:25:37,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +14: [2022-12-02 02:25:37,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +14: [2022-12-02 02:25:37,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +14: [2022-12-02 02:25:37,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +24: [2022-12-02 02:25:37,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. +24: [2022-12-02 02:25:37,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt +24: [2022-12-02 02:25:37,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +59: [2022-12-02 02:25:37,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt. +59: [2022-12-02 02:25:37,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt +37: [2022-12-02 02:25:37,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt. +59: [2022-12-02 02:25:37,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +37: [2022-12-02 02:25:37,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt +37: [2022-12-02 02:25:37,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 3: [2022-12-02 02:25:37,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. + 3: [2022-12-02 02:25:37,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt + 3: [2022-12-02 02:25:37,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +63: [2022-12-02 02:25:37,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt. +63: [2022-12-02 02:25:37,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt +63: [2022-12-02 02:25:37,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 7: [2022-12-02 02:25:37,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. + 7: [2022-12-02 02:25:37,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt + 7: [2022-12-02 02:25:37,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +38: [2022-12-02 02:25:37,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt. +38: [2022-12-02 02:25:37,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt +38: [2022-12-02 02:25:37,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +31: [2022-12-02 02:25:37,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. +31: [2022-12-02 02:25:37,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt +31: [2022-12-02 02:25:37,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +51: [2022-12-02 02:25:37,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt. +51: [2022-12-02 02:25:37,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt. +51: [2022-12-02 02:25:37,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt +51: [2022-12-02 02:25:37,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt +51: [2022-12-02 02:25:37,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +51: [2022-12-02 02:25:37,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +33: [2022-12-02 02:25:37,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt. +33: [2022-12-02 02:25:37,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt +33: [2022-12-02 02:25:37,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +12: [2022-12-02 02:25:37,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +12: [2022-12-02 02:25:37,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +12: [2022-12-02 02:25:37,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 5: [2022-12-02 02:25:37,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. + 5: [2022-12-02 02:25:37,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt + 5: [2022-12-02 02:25:37,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +48: [2022-12-02 02:25:37,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt. +48: [2022-12-02 02:25:37,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt +48: [2022-12-02 02:25:37,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +49: [2022-12-02 02:25:37,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt. +49: [2022-12-02 02:25:37,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt +49: [2022-12-02 02:25:37,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +46: [2022-12-02 02:25:37,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt. +46: [2022-12-02 02:25:37,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt +46: [2022-12-02 02:25:37,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +10: [2022-12-02 02:25:37,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +10: [2022-12-02 02:25:37,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +10: [2022-12-02 02:25:37,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +22: [2022-12-02 02:25:37,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. +22: [2022-12-02 02:25:37,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt +22: [2022-12-02 02:25:37,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 9: [2022-12-02 02:25:37,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. + 9: [2022-12-02 02:25:37,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +54: [2022-12-02 02:25:37,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt. + 9: [2022-12-02 02:25:37,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +54: [2022-12-02 02:25:37,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt +42: [2022-12-02 02:25:37,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt. +54: [2022-12-02 02:25:37,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +42: [2022-12-02 02:25:37,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt +42: [2022-12-02 02:25:37,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +52: [2022-12-02 02:25:37,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt. +52: [2022-12-02 02:25:37,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt +52: [2022-12-02 02:25:37,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +47: [2022-12-02 02:25:37,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt. +47: [2022-12-02 02:25:37,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt +47: [2022-12-02 02:25:37,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +18: [2022-12-02 02:25:37,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. +18: [2022-12-02 02:25:37,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt +18: [2022-12-02 02:25:37,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +52: [2022-12-02 02:25:37,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt. +52: [2022-12-02 02:25:37,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt +52: [2022-12-02 02:25:37,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 1: [2022-12-02 02:25:37,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. + 1: [2022-12-02 02:25:37,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt + 1: [2022-12-02 02:25:37,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +43: [2022-12-02 02:25:37,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt. +43: [2022-12-02 02:25:37,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt +43: [2022-12-02 02:25:37,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +11: [2022-12-02 02:25:37,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +11: [2022-12-02 02:25:37,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +11: [2022-12-02 02:25:37,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +24: [2022-12-02 02:25:37,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. +24: [2022-12-02 02:25:37,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt +24: [2022-12-02 02:25:37,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +53: [2022-12-02 02:25:37,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt. +53: [2022-12-02 02:25:37,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt. +53: [2022-12-02 02:25:37,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt. +53: [2022-12-02 02:25:37,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt. +53: [2022-12-02 02:25:37,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt. +53: [2022-12-02 02:25:37,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt. +53: [2022-12-02 02:25:37,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt. +53: [2022-12-02 02:25:37,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt. +53: [2022-12-02 02:25:37,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt +53: [2022-12-02 02:25:37,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt +53: [2022-12-02 02:25:37,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt +53: [2022-12-02 02:25:37,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt +53: [2022-12-02 02:25:37,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt +53: [2022-12-02 02:25:37,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt +53: [2022-12-02 02:25:37,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt +53: [2022-12-02 02:25:37,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt +53: [2022-12-02 02:25:37,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +53: [2022-12-02 02:25:37,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +53: [2022-12-02 02:25:37,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +53: [2022-12-02 02:25:37,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +53: [2022-12-02 02:25:37,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +53: [2022-12-02 02:25:37,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +53: [2022-12-02 02:25:37,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +53: [2022-12-02 02:25:37,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! + 0: successfully saved checkpoint at iteration 5000 to checkpoints_8b7beta +63: time (ms) | save-checkpoint: 7506.10 +63: iteration 5010/ 5494 | consumed samples: 5130240 | consumed tokens: 10506731520 | elapsed time per iteration (s): 6.40 | learning rate: 2.350E-05 | global batch size: 1024 | lm loss: 2.147812E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 160.057 | TFLOPs: 35.78 | +63: iteration 5020/ 5494 | consumed samples: 5140480 | consumed tokens: 10527703040 | elapsed time per iteration (s): 5.50 | learning rate: 2.335E-05 | global batch size: 1024 | lm loss: 2.136121E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.069 | TFLOPs: 41.60 | +63: iteration 5030/ 5494 | consumed samples: 5150720 | consumed tokens: 10548674560 | elapsed time per iteration (s): 5.62 | learning rate: 2.321E-05 | global batch size: 1024 | lm loss: 2.131770E+00 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.124 | TFLOPs: 40.72 | +63: iteration 5040/ 5494 | consumed samples: 5160960 | consumed tokens: 10569646080 | elapsed time per iteration (s): 5.37 | learning rate: 2.308E-05 | global batch size: 1024 | lm loss: 2.155372E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.572 | TFLOPs: 42.61 | +63: iteration 5050/ 5494 | consumed samples: 5171200 | consumed tokens: 10590617600 | elapsed time per iteration (s): 5.41 | learning rate: 2.294E-05 | global batch size: 1024 | lm loss: 2.147984E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.314 | TFLOPs: 42.32 | +63: iteration 5060/ 5494 | consumed samples: 5181440 | consumed tokens: 10611589120 | elapsed time per iteration (s): 5.63 | learning rate: 2.281E-05 | global batch size: 1024 | lm loss: 2.129159E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.894 | TFLOPs: 40.67 | +63: iteration 5070/ 5494 | consumed samples: 5191680 | consumed tokens: 10632560640 | elapsed time per iteration (s): 5.54 | learning rate: 2.269E-05 | global batch size: 1024 | lm loss: 2.139641E+00 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.816 | TFLOPs: 41.32 | +63: iteration 5080/ 5494 | consumed samples: 5201920 | consumed tokens: 10653532160 | elapsed time per iteration (s): 5.98 | learning rate: 2.256E-05 | global batch size: 1024 | lm loss: 2.146426E+00 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.244 | TFLOPs: 38.28 | +63: iteration 5090/ 5494 | consumed samples: 5212160 | consumed tokens: 10674503680 | elapsed time per iteration (s): 5.55 | learning rate: 2.244E-05 | global batch size: 1024 | lm loss: 2.138970E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.646 | TFLOPs: 41.28 | +63: iteration 5100/ 5494 | consumed samples: 5222400 | consumed tokens: 10695475200 | elapsed time per iteration (s): 5.87 | learning rate: 2.232E-05 | global batch size: 1024 | lm loss: 2.140875E+00 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.434 | TFLOPs: 39.00 | +63: iteration 5110/ 5494 | consumed samples: 5232640 | consumed tokens: 10716446720 | elapsed time per iteration (s): 5.59 | learning rate: 2.221E-05 | global batch size: 1024 | lm loss: 2.146891E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.087 | TFLOPs: 40.93 | +63: iteration 5120/ 5494 | consumed samples: 5242880 | consumed tokens: 10737418240 | elapsed time per iteration (s): 5.50 | learning rate: 2.209E-05 | global batch size: 1024 | lm loss: 2.141326E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.320 | TFLOPs: 41.66 | +63: iteration 5130/ 5494 | consumed samples: 5253120 | consumed tokens: 10758389760 | elapsed time per iteration (s): 5.60 | learning rate: 2.198E-05 | global batch size: 1024 | lm loss: 2.157316E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.849 | TFLOPs: 40.88 | +63: iteration 5140/ 5494 | consumed samples: 5263360 | consumed tokens: 10779361280 | elapsed time per iteration (s): 5.63 | learning rate: 2.188E-05 | global batch size: 1024 | lm loss: 2.145672E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.816 | TFLOPs: 40.65 | +63: iteration 5150/ 5494 | consumed samples: 5273600 | consumed tokens: 10800332800 | elapsed time per iteration (s): 5.62 | learning rate: 2.177E-05 | global batch size: 1024 | lm loss: 2.130510E+00 | grad norm: 0.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.057 | TFLOPs: 40.70 | +63: iteration 5160/ 5494 | consumed samples: 5283840 | consumed tokens: 10821304320 | elapsed time per iteration (s): 5.48 | learning rate: 2.167E-05 | global batch size: 1024 | lm loss: 2.142476E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.885 | TFLOPs: 41.78 | +63: iteration 5170/ 5494 | consumed samples: 5294080 | consumed tokens: 10842275840 | elapsed time per iteration (s): 5.41 | learning rate: 2.157E-05 | global batch size: 1024 | lm loss: 2.146174E+00 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.398 | TFLOPs: 42.34 | +63: iteration 5180/ 5494 | consumed samples: 5304320 | consumed tokens: 10863247360 | elapsed time per iteration (s): 5.38 | learning rate: 2.148E-05 | global batch size: 1024 | lm loss: 2.135205E+00 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.350 | TFLOPs: 42.56 | +63: iteration 5190/ 5494 | consumed samples: 5314560 | consumed tokens: 10884218880 | elapsed time per iteration (s): 5.70 | learning rate: 2.138E-05 | global batch size: 1024 | lm loss: 2.141199E+00 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.695 | TFLOPs: 40.17 | +63: iteration 5200/ 5494 | consumed samples: 5324800 | consumed tokens: 10905190400 | elapsed time per iteration (s): 5.95 | learning rate: 2.130E-05 | global batch size: 1024 | lm loss: 2.148487E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.019 | TFLOPs: 38.46 | +63: iteration 5210/ 5494 | consumed samples: 5335040 | consumed tokens: 10926161920 | elapsed time per iteration (s): 5.47 | learning rate: 2.121E-05 | global batch size: 1024 | lm loss: 2.132035E+00 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 187.071 | TFLOPs: 41.82 | +63: iteration 5220/ 5494 | consumed samples: 5345280 | consumed tokens: 10947133440 | elapsed time per iteration (s): 5.41 | learning rate: 2.113E-05 | global batch size: 1024 | lm loss: 2.149103E+00 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.229 | TFLOPs: 42.31 | +63: iteration 5230/ 5494 | consumed samples: 5355520 | consumed tokens: 10968104960 | elapsed time per iteration (s): 5.77 | learning rate: 2.105E-05 | global batch size: 1024 | lm loss: 2.131770E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.601 | TFLOPs: 39.71 | +63: iteration 5240/ 5494 | consumed samples: 5365760 | consumed tokens: 10989076480 | elapsed time per iteration (s): 5.53 | learning rate: 2.097E-05 | global batch size: 1024 | lm loss: 2.129029E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.226 | TFLOPs: 41.41 | +63: iteration 5250/ 5494 | consumed samples: 5376000 | consumed tokens: 11010048000 | elapsed time per iteration (s): 5.62 | learning rate: 2.089E-05 | global batch size: 1024 | lm loss: 2.133801E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.128 | TFLOPs: 40.72 | +63: iteration 5260/ 5494 | consumed samples: 5386240 | consumed tokens: 11031019520 | elapsed time per iteration (s): 5.51 | learning rate: 2.082E-05 | global batch size: 1024 | lm loss: 2.150228E+00 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.701 | TFLOPs: 41.52 | +63: iteration 5270/ 5494 | consumed samples: 5396480 | consumed tokens: 11051991040 | elapsed time per iteration (s): 5.39 | learning rate: 2.075E-05 | global batch size: 1024 | lm loss: 2.138353E+00 | grad norm: 0.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.924 | TFLOPs: 42.46 | +63: iteration 5280/ 5494 | consumed samples: 5406720 | consumed tokens: 11072962560 | elapsed time per iteration (s): 5.64 | learning rate: 2.069E-05 | global batch size: 1024 | lm loss: 2.143057E+00 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.414 | TFLOPs: 40.56 | +63: iteration 5290/ 5494 | consumed samples: 5416960 | consumed tokens: 11093934080 | elapsed time per iteration (s): 5.76 | learning rate: 2.062E-05 | global batch size: 1024 | lm loss: 2.141732E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.725 | TFLOPs: 39.73 | +63: iteration 5300/ 5494 | consumed samples: 5427200 | consumed tokens: 11114905600 | elapsed time per iteration (s): 5.89 | learning rate: 2.057E-05 | global batch size: 1024 | lm loss: 2.130778E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.873 | TFLOPs: 38.87 | +63: iteration 5310/ 5494 | consumed samples: 5437440 | consumed tokens: 11135877120 | elapsed time per iteration (s): 5.70 | learning rate: 2.051E-05 | global batch size: 1024 | lm loss: 2.119678E+00 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.520 | TFLOPs: 40.13 | +63: iteration 5320/ 5494 | consumed samples: 5447680 | consumed tokens: 11156848640 | elapsed time per iteration (s): 5.49 | learning rate: 2.045E-05 | global batch size: 1024 | lm loss: 2.142694E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.383 | TFLOPs: 41.67 | +63: iteration 5330/ 5494 | consumed samples: 5457920 | consumed tokens: 11177820160 | elapsed time per iteration (s): 5.39 | learning rate: 2.040E-05 | global batch size: 1024 | lm loss: 2.145358E+00 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.811 | TFLOPs: 42.44 | +63: iteration 5340/ 5494 | consumed samples: 5468160 | consumed tokens: 11198791680 | elapsed time per iteration (s): 5.61 | learning rate: 2.036E-05 | global batch size: 1024 | lm loss: 2.142475E+00 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.593 | TFLOPs: 40.82 | +63: iteration 5350/ 5494 | consumed samples: 5478400 | consumed tokens: 11219763200 | elapsed time per iteration (s): 5.60 | learning rate: 2.031E-05 | global batch size: 1024 | lm loss: 2.137687E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.729 | TFLOPs: 40.85 | +63: iteration 5360/ 5494 | consumed samples: 5488640 | consumed tokens: 11240734720 | elapsed time per iteration (s): 5.57 | learning rate: 2.027E-05 | global batch size: 1024 | lm loss: 2.155963E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.978 | TFLOPs: 41.13 | +63: iteration 5370/ 5494 | consumed samples: 5498880 | consumed tokens: 11261706240 | elapsed time per iteration (s): 5.54 | learning rate: 2.023E-05 | global batch size: 1024 | lm loss: 2.133789E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.982 | TFLOPs: 41.36 | +63: iteration 5380/ 5494 | consumed samples: 5509120 | consumed tokens: 11282677760 | elapsed time per iteration (s): 5.50 | learning rate: 2.020E-05 | global batch size: 1024 | lm loss: 2.139968E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.318 | TFLOPs: 41.65 | +63: iteration 5390/ 5494 | consumed samples: 5519360 | consumed tokens: 11303649280 | elapsed time per iteration (s): 5.55 | learning rate: 2.016E-05 | global batch size: 1024 | lm loss: 2.122280E+00 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.459 | TFLOPs: 41.24 | +63: iteration 5400/ 5494 | consumed samples: 5529600 | consumed tokens: 11324620800 | elapsed time per iteration (s): 5.65 | learning rate: 2.013E-05 | global batch size: 1024 | lm loss: 2.128817E+00 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.367 | TFLOPs: 40.55 | +63: iteration 5410/ 5494 | consumed samples: 5539840 | consumed tokens: 11345592320 | elapsed time per iteration (s): 5.60 | learning rate: 2.011E-05 | global batch size: 1024 | lm loss: 2.158111E+00 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.903 | TFLOPs: 40.89 | +63: iteration 5420/ 5494 | consumed samples: 5550080 | consumed tokens: 11366563840 | elapsed time per iteration (s): 5.80 | learning rate: 2.008E-05 | global batch size: 1024 | lm loss: 2.129410E+00 | grad norm: 0.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.674 | TFLOPs: 39.50 | +63: iteration 5430/ 5494 | consumed samples: 5560320 | consumed tokens: 11387535360 | elapsed time per iteration (s): 5.51 | learning rate: 2.006E-05 | global batch size: 1024 | lm loss: 2.119676E+00 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.869 | TFLOPs: 41.55 | +63: iteration 5440/ 5494 | consumed samples: 5570560 | consumed tokens: 11408506880 | elapsed time per iteration (s): 5.55 | learning rate: 2.004E-05 | global batch size: 1024 | lm loss: 2.127531E+00 | grad norm: 0.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.487 | TFLOPs: 41.25 | +63: iteration 5450/ 5494 | consumed samples: 5580800 | consumed tokens: 11429478400 | elapsed time per iteration (s): 7.98 | learning rate: 2.003E-05 | global batch size: 1024 | lm loss: 2.131797E+00 | grad norm: 0.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 128.368 | TFLOPs: 28.70 | +63: iteration 5460/ 5494 | consumed samples: 5591040 | consumed tokens: 11450449920 | elapsed time per iteration (s): 5.52 | learning rate: 2.002E-05 | global batch size: 1024 | lm loss: 2.138086E+00 | grad norm: 0.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.474 | TFLOPs: 41.47 | +63: iteration 5470/ 5494 | consumed samples: 5601280 | consumed tokens: 11471421440 | elapsed time per iteration (s): 5.50 | learning rate: 2.001E-05 | global batch size: 1024 | lm loss: 2.148790E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.019 | TFLOPs: 41.59 | +63: iteration 5480/ 5494 | consumed samples: 5611520 | consumed tokens: 11492392960 | elapsed time per iteration (s): 5.52 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.132012E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.528 | TFLOPs: 41.48 | +63: iteration 5490/ 5494 | consumed samples: 5621760 | consumed tokens: 11513364480 | elapsed time per iteration (s): 5.54 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.146653E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.004 | TFLOPs: 41.36 | + 0: [after training is done] datetime: 2022-12-02 03:11:58 + 0: saving checkpoint at iteration 5494 to checkpoints_8b7beta +63: ------------------------------------------------------------------------------------------------------------ +63: valid loss at the end of training for val data | lm loss value: 2.101619E+00 | lm loss PPL: 8.179404E+00 | +63: ------------------------------------------------------------------------------------------------------------ + 0: [2022-12-02 03:12:00,412] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5494 is begin to save! +32: [2022-12-02 03:12:00,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_24-model_01-model_states.pt... +32: [2022-12-02 03:12:00,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_24-model_00-model_states.pt... + 0: [2022-12-02 03:12:00,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_01-model_00-model_states.pt... + 0: [2022-12-02 03:12:00,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_01-model_01-model_states.pt... +32: [2022-12-02 03:12:00,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_24-model_00-model_states.pt. +32: [2022-12-02 03:12:00,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_24-model_01-model_states.pt. +32: [2022-12-02 03:12:00,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_25-model_00-model_states.pt... +32: [2022-12-02 03:12:00,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_25-model_01-model_states.pt... + 0: [2022-12-02 03:12:00,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_01-model_01-model_states.pt. + 0: [2022-12-02 03:12:00,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_01-model_00-model_states.pt. + 0: [2022-12-02 03:12:00,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_03-model_01-model_states.pt... + 0: [2022-12-02 03:12:00,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_03-model_00-model_states.pt... +32: [2022-12-02 03:12:01,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_25-model_00-model_states.pt. +32: [2022-12-02 03:12:01,151] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_26-model_00-model_states.pt... +32: [2022-12-02 03:12:01,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_25-model_01-model_states.pt. +32: [2022-12-02 03:12:01,159] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_26-model_01-model_states.pt... + 0: [2022-12-02 03:12:01,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_03-model_01-model_states.pt. + 0: [2022-12-02 03:12:01,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_03-model_00-model_states.pt. + 0: [2022-12-02 03:12:01,167] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_04-model_01-model_states.pt... + 0: [2022-12-02 03:12:01,167] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_04-model_00-model_states.pt... + 0: [2022-12-02 03:12:01,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_04-model_01-model_states.pt. + 0: [2022-12-02 03:12:01,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_05-model_01-model_states.pt... +32: [2022-12-02 03:12:01,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_26-model_01-model_states.pt. + 0: [2022-12-02 03:12:01,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_04-model_00-model_states.pt. +32: [2022-12-02 03:12:01,433] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_27-model_01-model_states.pt... + 0: [2022-12-02 03:12:01,434] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_05-model_00-model_states.pt... +32: [2022-12-02 03:12:01,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_26-model_00-model_states.pt. +32: [2022-12-02 03:12:01,436] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_27-model_00-model_states.pt... + 0: [2022-12-02 03:12:01,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_05-model_00-model_states.pt. + 0: [2022-12-02 03:12:01,684] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_06-model_00-model_states.pt... + 0: [2022-12-02 03:12:01,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_05-model_01-model_states.pt. + 0: [2022-12-02 03:12:01,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_06-model_01-model_states.pt... +32: [2022-12-02 03:12:01,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_27-model_01-model_states.pt. +32: [2022-12-02 03:12:01,698] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_28-model_01-model_states.pt... +32: [2022-12-02 03:12:01,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_27-model_00-model_states.pt. +32: [2022-12-02 03:12:01,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_28-model_00-model_states.pt... + 0: [2022-12-02 03:12:01,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_06-model_00-model_states.pt. + 0: [2022-12-02 03:12:01,945] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_07-model_00-model_states.pt... +32: [2022-12-02 03:12:01,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_28-model_00-model_states.pt. +32: [2022-12-02 03:12:01,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_28-model_01-model_states.pt. +32: [2022-12-02 03:12:01,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_29-model_00-model_states.pt... +32: [2022-12-02 03:12:01,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_29-model_01-model_states.pt... + 0: [2022-12-02 03:12:01,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_06-model_01-model_states.pt. + 0: [2022-12-02 03:12:01,962] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_07-model_01-model_states.pt... + 0: [2022-12-02 03:12:02,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_07-model_01-model_states.pt. + 0: [2022-12-02 03:12:02,199] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_08-model_01-model_states.pt... + 0: [2022-12-02 03:12:02,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_07-model_00-model_states.pt. + 0: [2022-12-02 03:12:02,209] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_08-model_00-model_states.pt... +32: [2022-12-02 03:12:02,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_29-model_01-model_states.pt. +32: [2022-12-02 03:12:02,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_29-model_00-model_states.pt. +32: [2022-12-02 03:12:02,211] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_30-model_00-model_states.pt... +32: [2022-12-02 03:12:02,211] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_30-model_01-model_states.pt... + 0: [2022-12-02 03:12:02,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_08-model_00-model_states.pt. + 0: [2022-12-02 03:12:02,447] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_09-model_00-model_states.pt... + 0: [2022-12-02 03:12:02,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_08-model_01-model_states.pt. + 0: [2022-12-02 03:12:02,451] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_09-model_01-model_states.pt... +32: [2022-12-02 03:12:02,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_30-model_01-model_states.pt. +32: [2022-12-02 03:12:02,475] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_31-model_01-model_states.pt... +32: [2022-12-02 03:12:02,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_30-model_00-model_states.pt. +32: [2022-12-02 03:12:02,493] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_31-model_00-model_states.pt... + 0: [2022-12-02 03:12:02,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_09-model_01-model_states.pt. + 0: [2022-12-02 03:12:02,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_10-model_01-model_states.pt... + 0: [2022-12-02 03:12:02,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_09-model_00-model_states.pt. + 0: [2022-12-02 03:12:02,719] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_10-model_00-model_states.pt... +32: [2022-12-02 03:12:02,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_31-model_00-model_states.pt. +32: [2022-12-02 03:12:02,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_32-model_00-model_states.pt... +32: [2022-12-02 03:12:02,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_31-model_01-model_states.pt. +32: [2022-12-02 03:12:02,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_32-model_01-model_states.pt... +32: [2022-12-02 03:12:02,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_32-model_00-model_states.pt. +32: [2022-12-02 03:12:02,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_33-model_00-model_states.pt... +32: [2022-12-02 03:12:02,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_32-model_01-model_states.pt. +32: [2022-12-02 03:12:02,974] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_33-model_01-model_states.pt... + 0: [2022-12-02 03:12:02,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_10-model_01-model_states.pt. + 0: [2022-12-02 03:12:02,979] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_11-model_01-model_states.pt... + 0: [2022-12-02 03:12:02,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_10-model_00-model_states.pt. + 0: [2022-12-02 03:12:02,992] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_11-model_00-model_states.pt... + 0: [2022-12-02 03:12:03,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_11-model_00-model_states.pt. + 0: [2022-12-02 03:12:03,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_11-model_01-model_states.pt. + 0: [2022-12-02 03:12:03,230] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_12-model_00-model_states.pt... + 0: [2022-12-02 03:12:03,230] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_12-model_01-model_states.pt... +32: [2022-12-02 03:12:03,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_33-model_01-model_states.pt. +32: [2022-12-02 03:12:03,241] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_34-model_01-model_states.pt... +32: [2022-12-02 03:12:03,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_33-model_00-model_states.pt. +32: [2022-12-02 03:12:03,242] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_34-model_00-model_states.pt... + 0: [2022-12-02 03:12:03,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_12-model_01-model_states.pt. + 0: [2022-12-02 03:12:03,476] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_13-model_01-model_states.pt... + 0: [2022-12-02 03:12:03,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_12-model_00-model_states.pt. + 0: [2022-12-02 03:12:03,477] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_13-model_00-model_states.pt... +32: [2022-12-02 03:12:03,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_34-model_00-model_states.pt. +32: [2022-12-02 03:12:03,504] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_35-model_00-model_states.pt... +32: [2022-12-02 03:12:03,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_34-model_01-model_states.pt. +32: [2022-12-02 03:12:03,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_35-model_01-model_states.pt... + 0: [2022-12-02 03:12:03,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_13-model_01-model_states.pt. + 0: [2022-12-02 03:12:03,728] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_14-model_01-model_states.pt... + 0: [2022-12-02 03:12:03,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_13-model_00-model_states.pt. + 0: [2022-12-02 03:12:03,755] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_14-model_00-model_states.pt... +32: [2022-12-02 03:12:03,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_35-model_00-model_states.pt. +32: [2022-12-02 03:12:03,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_36-model_00-model_states.pt... +32: [2022-12-02 03:12:03,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_35-model_01-model_states.pt. +32: [2022-12-02 03:12:03,766] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_36-model_01-model_states.pt... + 0: [2022-12-02 03:12:03,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_14-model_01-model_states.pt. + 0: [2022-12-02 03:12:03,956] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_15-model_01-model_states.pt... + 0: [2022-12-02 03:12:03,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_14-model_00-model_states.pt. + 0: [2022-12-02 03:12:03,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_15-model_00-model_states.pt... +32: [2022-12-02 03:12:04,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_36-model_00-model_states.pt. +32: [2022-12-02 03:12:04,055] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_37-model_00-model_states.pt... +32: [2022-12-02 03:12:04,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_36-model_01-model_states.pt. +32: [2022-12-02 03:12:04,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_37-model_01-model_states.pt... + 0: [2022-12-02 03:12:04,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_15-model_01-model_states.pt. + 0: [2022-12-02 03:12:04,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_16-model_01-model_states.pt... + 0: [2022-12-02 03:12:04,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_15-model_00-model_states.pt. + 0: [2022-12-02 03:12:04,289] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_16-model_00-model_states.pt... +32: [2022-12-02 03:12:04,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_37-model_00-model_states.pt. +32: [2022-12-02 03:12:04,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_37-model_01-model_states.pt. +32: [2022-12-02 03:12:04,336] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_38-model_00-model_states.pt... +32: [2022-12-02 03:12:04,336] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_38-model_01-model_states.pt... + 0: [2022-12-02 03:12:04,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_16-model_01-model_states.pt. + 0: [2022-12-02 03:12:04,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_17-model_01-model_states.pt... + 0: [2022-12-02 03:12:04,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_16-model_00-model_states.pt. + 0: [2022-12-02 03:12:04,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_17-model_00-model_states.pt... +32: [2022-12-02 03:12:04,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_38-model_01-model_states.pt. +32: [2022-12-02 03:12:04,604] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_39-model_01-model_states.pt... +32: [2022-12-02 03:12:04,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_38-model_00-model_states.pt. +32: [2022-12-02 03:12:04,616] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_39-model_00-model_states.pt... + 0: [2022-12-02 03:12:04,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_17-model_00-model_states.pt. + 0: [2022-12-02 03:12:04,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_17-model_01-model_states.pt. + 0: [2022-12-02 03:12:04,777] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_18-model_01-model_states.pt... + 0: [2022-12-02 03:12:04,777] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_18-model_00-model_states.pt... +32: [2022-12-02 03:12:04,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_39-model_01-model_states.pt. +32: [2022-12-02 03:12:04,881] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_40-model_01-model_states.pt... +32: [2022-12-02 03:12:04,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_39-model_00-model_states.pt. +32: [2022-12-02 03:12:04,883] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_40-model_00-model_states.pt... + 0: [2022-12-02 03:12:05,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_18-model_01-model_states.pt. + 0: [2022-12-02 03:12:05,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_19-model_01-model_states.pt... + 0: [2022-12-02 03:12:05,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_18-model_00-model_states.pt. + 0: [2022-12-02 03:12:05,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_19-model_00-model_states.pt... +32: [2022-12-02 03:12:05,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_40-model_00-model_states.pt. +32: [2022-12-02 03:12:05,134] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_41-model_00-model_states.pt... +32: [2022-12-02 03:12:05,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_40-model_01-model_states.pt. +32: [2022-12-02 03:12:05,144] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_41-model_01-model_states.pt... + 0: [2022-12-02 03:12:05,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_19-model_00-model_states.pt. + 0: [2022-12-02 03:12:05,246] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_20-model_00-model_states.pt... + 0: [2022-12-02 03:12:05,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_19-model_01-model_states.pt. + 0: [2022-12-02 03:12:05,249] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_20-model_01-model_states.pt... +32: [2022-12-02 03:12:05,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_41-model_00-model_states.pt. +32: [2022-12-02 03:12:05,370] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_42-model_00-model_states.pt... +32: [2022-12-02 03:12:05,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_41-model_01-model_states.pt. +32: [2022-12-02 03:12:05,384] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_42-model_01-model_states.pt... + 0: [2022-12-02 03:12:05,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_20-model_00-model_states.pt. + 0: [2022-12-02 03:12:05,494] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_21-model_00-model_states.pt... + 0: [2022-12-02 03:12:05,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_20-model_01-model_states.pt. + 0: [2022-12-02 03:12:05,503] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_21-model_01-model_states.pt... +32: [2022-12-02 03:12:05,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_42-model_00-model_states.pt. +32: [2022-12-02 03:12:05,604] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_43-model_00-model_states.pt... +32: [2022-12-02 03:12:05,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_42-model_01-model_states.pt. +32: [2022-12-02 03:12:05,625] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_43-model_01-model_states.pt... + 0: [2022-12-02 03:12:05,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_21-model_00-model_states.pt. + 0: [2022-12-02 03:12:05,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_22-model_00-model_states.pt... + 0: [2022-12-02 03:12:05,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_21-model_01-model_states.pt. + 0: [2022-12-02 03:12:05,731] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_22-model_01-model_states.pt... +32: [2022-12-02 03:12:05,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_43-model_01-model_states.pt. +32: [2022-12-02 03:12:05,853] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_44-model_01-model_states.pt... +32: [2022-12-02 03:12:05,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_43-model_00-model_states.pt. +32: [2022-12-02 03:12:05,863] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_44-model_00-model_states.pt... + 0: [2022-12-02 03:12:05,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_22-model_00-model_states.pt. + 0: [2022-12-02 03:12:05,956] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_23-model_00-model_states.pt... + 0: [2022-12-02 03:12:05,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_22-model_01-model_states.pt. + 0: [2022-12-02 03:12:05,968] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_23-model_01-model_states.pt... +32: [2022-12-02 03:12:06,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_44-model_01-model_states.pt. +32: [2022-12-02 03:12:06,118] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_46-model_01-model_states.pt... +32: [2022-12-02 03:12:06,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_44-model_00-model_states.pt. +32: [2022-12-02 03:12:06,133] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/layer_46-model_00-model_states.pt... +32: [2022-12-02 03:12:06,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_46-model_01-model_states.pt. +32: [2022-12-02 03:12:06,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_46-model_00-model_states.pt. +32: [2022-12-02 03:12:06,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/mp_rank_03_model_states.pt... +32: [2022-12-02 03:12:06,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/mp_rank_02_model_states.pt... +32: [2022-12-02 03:12:06,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/mp_rank_03_model_states.pt. +32: [2022-12-02 03:12:06,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/mp_rank_02_model_states.pt. + 0: [2022-12-02 03:12:06,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_23-model_00-model_states.pt. + 0: [2022-12-02 03:12:06,185] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_8b7beta/global_step5494/mp_rank_00_model_states.pt + 0: [2022-12-02 03:12:06,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/mp_rank_00_model_states.pt... + 0: [2022-12-02 03:12:06,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/layer_23-model_01-model_states.pt. + 0: [2022-12-02 03:12:06,195] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: checkpoints_8b7beta/global_step5494/mp_rank_01_model_states.pt + 0: [2022-12-02 03:12:06,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/mp_rank_01_model_states.pt... + 0: [2022-12-02 03:12:06,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/mp_rank_01_model_states.pt. + 0: [2022-12-02 03:12:06,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/mp_rank_00_model_states.pt. +56: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt... +56: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt... +56: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt... +56: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt... +62: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt... +62: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt... +62: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt... +62: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt... +60: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt... +60: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt... +60: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt... +32: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt... +49: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt... +52: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt... +52: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt... +52: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt... +52: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt... +59: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt... +59: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt... +48: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt... +48: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt... +48: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt... +48: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt... +54: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt... +54: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt... +54: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt... +56: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt... +62: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt... +62: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt... +60: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt... +51: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt... +51: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt... +51: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt... +51: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt... +63: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt... +63: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt... +63: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt... +63: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt... +39: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt... +39: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt... +39: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt... +39: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt... +57: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt... +57: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt... +57: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt... +43: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt... +43: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt... +43: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt... +43: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt... +38: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt... +38: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt... +38: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt... +38: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt... +36: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt... +42: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt... +42: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt... +42: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt... +58: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt... +58: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt... +58: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt... +58: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt... +44: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt... +44: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt... +44: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt... +34: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt... +34: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt... +34: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt... +34: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt... +32: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt... +32: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt... +50: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt... +50: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt... +46: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt... +46: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt... +46: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt... +46: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt... +49: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt... +41: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt... +41: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt... +41: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt... +41: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt... +55: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt... +55: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt... +55: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt... +37: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt... +37: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt... +37: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt... +33: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt... +33: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt... +33: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt... +33: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt... +52: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt... +61: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt... +61: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt... +61: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt... +61: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt... +45: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt... +45: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt... +47: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt... +47: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt... +47: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt... +47: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt... +59: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt... +59: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt... +35: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt... +35: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt... +35: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt... +35: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt... +48: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt... +53: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt... +53: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt... +53: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt... +53: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt... +40: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt... +40: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt... +40: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt... + 0: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +54: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt... +56: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt... +62: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt... +62: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt... + 4: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +60: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt... +60: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt... +60: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt... +60: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt... +51: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt... +63: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt... +63: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt... +63: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt... +17: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... +17: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... +39: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt... +39: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt... + 7: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... + 7: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... + 7: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +57: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt... +43: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt... +38: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt... +38: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt... +36: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt... +36: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt... +24: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... +42: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt... +58: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt... +58: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt... +58: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt... +44: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt... +34: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt... +34: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt... +32: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt... +50: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt... +46: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt... +26: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... +26: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... +49: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt... +49: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt... + 5: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... + 5: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +20: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... +20: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... +41: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt... +41: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt... +55: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt... +55: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt... +10: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +37: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt... +37: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt... +37: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt... + 8: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +18: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... +18: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... +33: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt... +33: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt... +33: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt... +52: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt... +52: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt... + 6: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +31: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... +31: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... +61: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt... +45: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt... +45: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt... +47: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt... + 3: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... + 9: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... + 9: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... + 9: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +59: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt... +59: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt... +35: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt... +35: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt... +19: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... +19: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... +19: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... +48: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt... +53: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt... +53: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt... +40: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt... +40: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt... +30: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... +16: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... +16: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... +16: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... + 0: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +54: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt... +54: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt... +54: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt... +56: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt... + 4: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +51: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt... +63: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt... +17: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... + 1: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +39: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt... +39: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt... + 7: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... +11: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +57: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt... +27: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... +27: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... +43: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt... +43: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt... +25: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... +25: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... +25: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... +13: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +13: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +13: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +14: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +14: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +38: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt... +38: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt... +22: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... +36: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt... +24: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... +24: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... +42: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt... +58: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt... +44: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt... +34: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt... +28: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... +28: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... +32: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt... +50: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt... +50: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt... +46: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt... +46: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt... +46: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt... +26: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... +49: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt... + 5: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... + 5: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +20: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... +41: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt... +55: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt... +55: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt... +10: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +37: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt... + 8: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +18: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... +21: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... +21: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... +33: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt... +52: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt... + 6: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +31: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... +31: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... +61: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt... +61: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt... +15: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +15: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +45: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt... +47: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt... + 3: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... + 3: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... + 3: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... + 9: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... + 9: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... +59: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt... +12: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... + 2: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... + 2: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +29: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... +29: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... +35: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt... +19: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... +19: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... +48: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt... +53: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt... +53: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt... +40: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt... +30: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... +23: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... +16: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... + 0: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +54: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt... +56: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt... + 4: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... + 4: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... + 4: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... +51: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt... +17: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... + 1: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... + 1: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... + 7: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... +11: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +11: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +11: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +57: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt... +27: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... +43: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt... +25: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... +25: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... +13: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +14: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +22: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... +36: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt... +36: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt... +36: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt... +24: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... +24: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... +42: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt... +42: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt... +44: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt... +34: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt... +28: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... +32: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt... +50: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt... +26: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... +26: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... +49: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt... + 5: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... + 5: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... + 5: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... +20: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... +41: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt... +55: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt... +10: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +37: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt... + 8: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +18: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... +21: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... + 6: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +31: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... +61: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt... +15: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +45: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt... +45: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt... +47: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt... + 3: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... + 3: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... + 3: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... + 9: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... + 9: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... + 9: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... +59: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt... +12: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... + 2: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... +29: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... +29: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... +35: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt... +19: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... +48: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt... +40: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt... +30: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... +30: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... +30: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... +23: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... +16: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... +16: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... + 0: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... + 4: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... + 4: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... +51: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt... +17: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... +17: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... +17: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... + 1: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... + 7: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +11: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... +57: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt... +27: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... +25: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... +25: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... +13: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... +14: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +22: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... +36: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt... +24: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... +24: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... +24: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... +42: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt... +44: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt... +28: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... +32: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt... +50: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt... +26: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... +49: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt... + 5: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... +20: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... + 8: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... + 8: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... +18: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... +21: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... +21: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... + 6: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... + 6: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... +31: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... +15: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... +45: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt... +47: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt... + 3: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... +12: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... + 2: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... + 2: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +29: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... +19: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... +40: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt... +30: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... +23: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... +23: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... +23: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... +23: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... +16: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... +16: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... + 0: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... + 4: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +17: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... + 1: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... + 1: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... + 7: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... +11: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... +57: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt... +27: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... +25: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... +13: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... +13: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... +14: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... +22: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... +22: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... +44: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt... +28: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... +32: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt... +50: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt... +26: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... +49: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt... +20: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... + 8: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... + 8: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... + 8: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... +18: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... +21: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... + 6: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... +31: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... +15: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... +15: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... +12: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... + 2: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... + 2: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... +29: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... +19: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... +30: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... +23: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... + 0: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... + 1: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... + 7: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... +11: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... +27: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... +13: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... +14: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... +14: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... +22: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... +22: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... +28: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... +26: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... +20: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... +18: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... +21: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... + 6: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... +31: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... +15: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... +12: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... +12: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... + 2: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +29: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... +30: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... +23: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... + 0: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... + 1: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... +11: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... +27: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... +14: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... +22: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... +28: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... +20: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... +18: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... +21: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... + 6: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... +15: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +12: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +29: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... + 0: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... +27: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... +28: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... +12: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... +10: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... +10: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... +10: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... +10: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... +10: [2022-12-02 03:12:06,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... + 0: [2022-12-02 03:12:06,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +32: [2022-12-02 03:12:06,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt. +32: [2022-12-02 03:12:06,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt +32: [2022-12-02 03:12:06,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +59: [2022-12-02 03:12:06,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt. +59: [2022-12-02 03:12:06,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt +59: [2022-12-02 03:12:06,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +32: [2022-12-02 03:12:06,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt. +32: [2022-12-02 03:12:06,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt +32: [2022-12-02 03:12:06,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +36: [2022-12-02 03:12:06,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt. +36: [2022-12-02 03:12:06,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt +36: [2022-12-02 03:12:06,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +10: [2022-12-02 03:12:06,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +10: [2022-12-02 03:12:06,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +10: [2022-12-02 03:12:06,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +61: [2022-12-02 03:12:06,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt. +38: [2022-12-02 03:12:06,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt. +38: [2022-12-02 03:12:06,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt +38: [2022-12-02 03:12:06,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +53: [2022-12-02 03:12:06,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt. +53: [2022-12-02 03:12:06,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt +54: [2022-12-02 03:12:06,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt. +32: [2022-12-02 03:12:06,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt. +53: [2022-12-02 03:12:06,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +54: [2022-12-02 03:12:06,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt +54: [2022-12-02 03:12:06,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +32: [2022-12-02 03:12:06,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt +32: [2022-12-02 03:12:06,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +33: [2022-12-02 03:12:06,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt. +33: [2022-12-02 03:12:06,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt. +33: [2022-12-02 03:12:06,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt +33: [2022-12-02 03:12:06,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt +33: [2022-12-02 03:12:06,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +33: [2022-12-02 03:12:06,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 0: [2022-12-02 03:12:06,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. + 0: [2022-12-02 03:12:06,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt + 0: [2022-12-02 03:12:06,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +20: [2022-12-02 03:12:06,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. +20: [2022-12-02 03:12:06,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt +20: [2022-12-02 03:12:06,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +20: [2022-12-02 03:12:06,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. +20: [2022-12-02 03:12:06,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt +20: [2022-12-02 03:12:06,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +32: [2022-12-02 03:12:06,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt. +32: [2022-12-02 03:12:06,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt +32: [2022-12-02 03:12:06,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +62: [2022-12-02 03:12:06,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt. +17: [2022-12-02 03:12:06,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. + 6: [2022-12-02 03:12:06,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +17: [2022-12-02 03:12:06,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt +17: [2022-12-02 03:12:06,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 6: [2022-12-02 03:12:06,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt + 6: [2022-12-02 03:12:06,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +62: [2022-12-02 03:12:06,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt +62: [2022-12-02 03:12:06,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +48: [2022-12-02 03:12:06,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt. +48: [2022-12-02 03:12:06,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt + 6: [2022-12-02 03:12:06,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +48: [2022-12-02 03:12:06,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 6: [2022-12-02 03:12:06,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt + 6: [2022-12-02 03:12:06,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 0: [2022-12-02 03:12:06,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt + 0: [2022-12-02 03:12:06,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 0: [2022-12-02 03:12:06,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. + 0: [2022-12-02 03:12:06,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt + 0: [2022-12-02 03:12:06,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +57: [2022-12-02 03:12:06,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt. +57: [2022-12-02 03:12:06,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt +57: [2022-12-02 03:12:06,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +17: [2022-12-02 03:12:06,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. +17: [2022-12-02 03:12:06,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt +17: [2022-12-02 03:12:06,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +45: [2022-12-02 03:12:06,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt. +45: [2022-12-02 03:12:06,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt +45: [2022-12-02 03:12:06,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +16: [2022-12-02 03:12:06,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. +16: [2022-12-02 03:12:06,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt +16: [2022-12-02 03:12:06,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +16: [2022-12-02 03:12:06,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. +16: [2022-12-02 03:12:06,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt +16: [2022-12-02 03:12:06,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +43: [2022-12-02 03:12:06,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt. +43: [2022-12-02 03:12:06,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt +43: [2022-12-02 03:12:06,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +42: [2022-12-02 03:12:06,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt. +42: [2022-12-02 03:12:06,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt +42: [2022-12-02 03:12:06,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +13: [2022-12-02 03:12:06,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. +13: [2022-12-02 03:12:06,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt +13: [2022-12-02 03:12:06,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +11: [2022-12-02 03:12:06,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +11: [2022-12-02 03:12:06,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +11: [2022-12-02 03:12:06,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +45: [2022-12-02 03:12:06,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt. +45: [2022-12-02 03:12:06,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt +45: [2022-12-02 03:12:06,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +38: [2022-12-02 03:12:06,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt. +38: [2022-12-02 03:12:06,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt + 0: [2022-12-02 03:12:06,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +38: [2022-12-02 03:12:06,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 0: [2022-12-02 03:12:06,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt + 0: [2022-12-02 03:12:06,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +45: [2022-12-02 03:12:06,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt. +45: [2022-12-02 03:12:06,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt +45: [2022-12-02 03:12:06,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +38: [2022-12-02 03:12:06,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt. +38: [2022-12-02 03:12:06,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt +10: [2022-12-02 03:12:06,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +38: [2022-12-02 03:12:06,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +10: [2022-12-02 03:12:06,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +10: [2022-12-02 03:12:06,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +13: [2022-12-02 03:12:06,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. +59: [2022-12-02 03:12:06,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt. +59: [2022-12-02 03:12:06,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt +59: [2022-12-02 03:12:06,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +56: [2022-12-02 03:12:06,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt. +16: [2022-12-02 03:12:06,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. +56: [2022-12-02 03:12:06,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt +56: [2022-12-02 03:12:06,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +16: [2022-12-02 03:12:06,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt +16: [2022-12-02 03:12:06,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +13: [2022-12-02 03:12:06,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt +13: [2022-12-02 03:12:06,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +10: [2022-12-02 03:12:06,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. +57: [2022-12-02 03:12:06,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt. +10: [2022-12-02 03:12:06,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt +57: [2022-12-02 03:12:06,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt +10: [2022-12-02 03:12:06,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +57: [2022-12-02 03:12:06,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +36: [2022-12-02 03:12:06,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt. +36: [2022-12-02 03:12:06,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt +36: [2022-12-02 03:12:06,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +56: [2022-12-02 03:12:06,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt. +61: [2022-12-02 03:12:06,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt +56: [2022-12-02 03:12:06,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt +61: [2022-12-02 03:12:06,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +61: [2022-12-02 03:12:06,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt. +56: [2022-12-02 03:12:06,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +61: [2022-12-02 03:12:06,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt +61: [2022-12-02 03:12:06,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +61: [2022-12-02 03:12:06,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt. +61: [2022-12-02 03:12:06,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt +61: [2022-12-02 03:12:06,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 9: [2022-12-02 03:12:06,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. + 9: [2022-12-02 03:12:06,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +60: [2022-12-02 03:12:06,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt. +60: [2022-12-02 03:12:06,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt +60: [2022-12-02 03:12:06,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 9: [2022-12-02 03:12:06,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +48: [2022-12-02 03:12:06,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt. +48: [2022-12-02 03:12:06,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt +48: [2022-12-02 03:12:06,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 9: [2022-12-02 03:12:06,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. + 9: [2022-12-02 03:12:06,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt + 9: [2022-12-02 03:12:06,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +20: [2022-12-02 03:12:06,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. +20: [2022-12-02 03:12:06,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt +20: [2022-12-02 03:12:06,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +59: [2022-12-02 03:12:06,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt. +62: [2022-12-02 03:12:06,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt. +62: [2022-12-02 03:12:06,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt +59: [2022-12-02 03:12:06,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt +62: [2022-12-02 03:12:06,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +59: [2022-12-02 03:12:06,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +47: [2022-12-02 03:12:06,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt. +47: [2022-12-02 03:12:06,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt +47: [2022-12-02 03:12:06,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +21: [2022-12-02 03:12:06,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. +21: [2022-12-02 03:12:06,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt +21: [2022-12-02 03:12:06,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +42: [2022-12-02 03:12:06,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt. +42: [2022-12-02 03:12:06,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt +42: [2022-12-02 03:12:06,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 3: [2022-12-02 03:12:06,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. + 3: [2022-12-02 03:12:06,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt + 3: [2022-12-02 03:12:06,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +45: [2022-12-02 03:12:06,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt. +45: [2022-12-02 03:12:06,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt +45: [2022-12-02 03:12:06,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +39: [2022-12-02 03:12:06,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt. + 6: [2022-12-02 03:12:06,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. + 6: [2022-12-02 03:12:06,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt + 6: [2022-12-02 03:12:06,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +39: [2022-12-02 03:12:06,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt +39: [2022-12-02 03:12:06,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +61: [2022-12-02 03:12:06,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt. +41: [2022-12-02 03:12:06,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt. +54: [2022-12-02 03:12:06,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt. +54: [2022-12-02 03:12:06,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt +54: [2022-12-02 03:12:06,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +41: [2022-12-02 03:12:06,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt. +41: [2022-12-02 03:12:06,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt +47: [2022-12-02 03:12:06,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt. +41: [2022-12-02 03:12:06,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +41: [2022-12-02 03:12:06,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt +47: [2022-12-02 03:12:06,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt +41: [2022-12-02 03:12:06,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +47: [2022-12-02 03:12:06,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +54: [2022-12-02 03:12:06,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt. +54: [2022-12-02 03:12:06,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt +54: [2022-12-02 03:12:06,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 3: [2022-12-02 03:12:06,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. + 3: [2022-12-02 03:12:06,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt + 3: [2022-12-02 03:12:06,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 3: [2022-12-02 03:12:06,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. + 3: [2022-12-02 03:12:06,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt + 3: [2022-12-02 03:12:06,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +21: [2022-12-02 03:12:06,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. +21: [2022-12-02 03:12:06,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt +43: [2022-12-02 03:12:06,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt. +21: [2022-12-02 03:12:06,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +43: [2022-12-02 03:12:06,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt +43: [2022-12-02 03:12:06,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +16: [2022-12-02 03:12:06,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. +16: [2022-12-02 03:12:06,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt +16: [2022-12-02 03:12:06,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +55: [2022-12-02 03:12:06,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt. +55: [2022-12-02 03:12:06,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt +55: [2022-12-02 03:12:06,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +55: [2022-12-02 03:12:06,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt. +55: [2022-12-02 03:12:06,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt +55: [2022-12-02 03:12:06,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 9: [2022-12-02 03:12:06,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. +29: [2022-12-02 03:12:06,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. +47: [2022-12-02 03:12:06,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt. + 9: [2022-12-02 03:12:06,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt +29: [2022-12-02 03:12:06,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt +47: [2022-12-02 03:12:06,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt + 9: [2022-12-02 03:12:06,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +29: [2022-12-02 03:12:06,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +47: [2022-12-02 03:12:06,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +56: [2022-12-02 03:12:06,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt. +56: [2022-12-02 03:12:06,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt +56: [2022-12-02 03:12:06,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +48: [2022-12-02 03:12:06,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt. +48: [2022-12-02 03:12:06,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt +48: [2022-12-02 03:12:06,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +13: [2022-12-02 03:12:06,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +13: [2022-12-02 03:12:06,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +13: [2022-12-02 03:12:06,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +44: [2022-12-02 03:12:06,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt. +44: [2022-12-02 03:12:06,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt. +44: [2022-12-02 03:12:06,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt +44: [2022-12-02 03:12:06,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt +44: [2022-12-02 03:12:06,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +44: [2022-12-02 03:12:06,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +43: [2022-12-02 03:12:06,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt. +43: [2022-12-02 03:12:06,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt +43: [2022-12-02 03:12:06,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +15: [2022-12-02 03:12:06,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. +15: [2022-12-02 03:12:06,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. +15: [2022-12-02 03:12:06,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt +15: [2022-12-02 03:12:06,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt +15: [2022-12-02 03:12:06,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +15: [2022-12-02 03:12:06,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +10: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +10: [2022-12-02 03:12:06,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +37: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt. +37: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt. +10: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +37: [2022-12-02 03:12:06,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt +37: [2022-12-02 03:12:06,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt +37: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +37: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 8: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. + 8: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. + 8: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. + 8: [2022-12-02 03:12:06,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt + 8: [2022-12-02 03:12:06,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt + 8: [2022-12-02 03:12:06,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt + 8: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 8: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 8: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +55: [2022-12-02 03:12:06,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt. +23: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. +23: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. +23: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. +55: [2022-12-02 03:12:06,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt +55: [2022-12-02 03:12:06,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +38: [2022-12-02 03:12:06,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt. +38: [2022-12-02 03:12:06,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt +15: [2022-12-02 03:12:06,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. +38: [2022-12-02 03:12:06,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +15: [2022-12-02 03:12:06,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt +15: [2022-12-02 03:12:06,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +20: [2022-12-02 03:12:06,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. +20: [2022-12-02 03:12:06,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt +23: [2022-12-02 03:12:06,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt +23: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +23: [2022-12-02 03:12:06,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt +23: [2022-12-02 03:12:06,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt +20: [2022-12-02 03:12:06,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +23: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +23: [2022-12-02 03:12:06,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +20: [2022-12-02 03:12:06,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. +19: [2022-12-02 03:12:06,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. +20: [2022-12-02 03:12:06,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt +19: [2022-12-02 03:12:06,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt +20: [2022-12-02 03:12:06,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +19: [2022-12-02 03:12:06,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +21: [2022-12-02 03:12:06,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. +21: [2022-12-02 03:12:06,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt +21: [2022-12-02 03:12:06,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +62: [2022-12-02 03:12:06,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt. +62: [2022-12-02 03:12:06,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt +62: [2022-12-02 03:12:06,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +11: [2022-12-02 03:12:06,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +11: [2022-12-02 03:12:06,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +11: [2022-12-02 03:12:06,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +29: [2022-12-02 03:12:06,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. +17: [2022-12-02 03:12:06,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. +17: [2022-12-02 03:12:06,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt +29: [2022-12-02 03:12:06,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt +17: [2022-12-02 03:12:06,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +29: [2022-12-02 03:12:06,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +28: [2022-12-02 03:12:06,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. +28: [2022-12-02 03:12:06,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt + 7: [2022-12-02 03:12:06,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +28: [2022-12-02 03:12:06,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 7: [2022-12-02 03:12:06,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +50: [2022-12-02 03:12:06,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt. +50: [2022-12-02 03:12:06,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt. +50: [2022-12-02 03:12:06,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt. + 7: [2022-12-02 03:12:06,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +50: [2022-12-02 03:12:06,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt +50: [2022-12-02 03:12:06,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt +50: [2022-12-02 03:12:06,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt +50: [2022-12-02 03:12:06,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +53: [2022-12-02 03:12:06,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt. +50: [2022-12-02 03:12:06,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +53: [2022-12-02 03:12:06,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt +50: [2022-12-02 03:12:06,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +53: [2022-12-02 03:12:06,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +28: [2022-12-02 03:12:06,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. +28: [2022-12-02 03:12:06,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt +28: [2022-12-02 03:12:06,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +50: [2022-12-02 03:12:06,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt. +24: [2022-12-02 03:12:06,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. +50: [2022-12-02 03:12:06,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt +24: [2022-12-02 03:12:06,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt +50: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +24: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 7: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. + 7: [2022-12-02 03:12:06,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt +16: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. + 7: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 7: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +16: [2022-12-02 03:12:06,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt + 7: [2022-12-02 03:12:06,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +46: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt. +46: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt. +16: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 7: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +46: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt. +46: [2022-12-02 03:12:06,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt +46: [2022-12-02 03:12:06,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt +59: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt. +59: [2022-12-02 03:12:06,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt +62: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt. +59: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +62: [2022-12-02 03:12:06,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt +62: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 6: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. + 6: [2022-12-02 03:12:06,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt + 6: [2022-12-02 03:12:06,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +46: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +46: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +46: [2022-12-02 03:12:06,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt +46: [2022-12-02 03:12:06,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +60: [2022-12-02 03:12:06,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt. +60: [2022-12-02 03:12:06,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt +60: [2022-12-02 03:12:06,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +60: [2022-12-02 03:12:06,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt. +60: [2022-12-02 03:12:06,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt +60: [2022-12-02 03:12:06,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +28: [2022-12-02 03:12:06,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. +28: [2022-12-02 03:12:06,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt +28: [2022-12-02 03:12:06,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +54: [2022-12-02 03:12:06,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt. +48: [2022-12-02 03:12:06,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt. +54: [2022-12-02 03:12:06,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt +54: [2022-12-02 03:12:06,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +48: [2022-12-02 03:12:06,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt +48: [2022-12-02 03:12:06,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 0: [2022-12-02 03:12:06,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. +41: [2022-12-02 03:12:06,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt. + 0: [2022-12-02 03:12:06,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt +41: [2022-12-02 03:12:06,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt + 0: [2022-12-02 03:12:06,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +41: [2022-12-02 03:12:06,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +55: [2022-12-02 03:12:06,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt. +55: [2022-12-02 03:12:06,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt +55: [2022-12-02 03:12:06,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +43: [2022-12-02 03:12:06,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt. +43: [2022-12-02 03:12:06,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt +43: [2022-12-02 03:12:06,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +19: [2022-12-02 03:12:06,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. +19: [2022-12-02 03:12:06,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt +19: [2022-12-02 03:12:06,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +29: [2022-12-02 03:12:06,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. +53: [2022-12-02 03:12:06,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt. +29: [2022-12-02 03:12:06,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt +53: [2022-12-02 03:12:06,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt +53: [2022-12-02 03:12:06,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +29: [2022-12-02 03:12:06,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +53: [2022-12-02 03:12:06,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt. +53: [2022-12-02 03:12:06,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt +53: [2022-12-02 03:12:06,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +47: [2022-12-02 03:12:06,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt. +47: [2022-12-02 03:12:06,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt +47: [2022-12-02 03:12:06,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +37: [2022-12-02 03:12:06,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt. +37: [2022-12-02 03:12:06,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt +37: [2022-12-02 03:12:06,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +42: [2022-12-02 03:12:06,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt. +42: [2022-12-02 03:12:06,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt +42: [2022-12-02 03:12:06,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +56: [2022-12-02 03:12:06,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt. +56: [2022-12-02 03:12:06,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt +56: [2022-12-02 03:12:06,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 1: [2022-12-02 03:12:06,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +26: [2022-12-02 03:12:06,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. +17: [2022-12-02 03:12:06,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. +26: [2022-12-02 03:12:06,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt +17: [2022-12-02 03:12:06,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt +26: [2022-12-02 03:12:06,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +17: [2022-12-02 03:12:06,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +42: [2022-12-02 03:12:06,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt. +42: [2022-12-02 03:12:06,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt +42: [2022-12-02 03:12:06,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +53: [2022-12-02 03:12:06,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt. +53: [2022-12-02 03:12:06,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt +53: [2022-12-02 03:12:06,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +63: [2022-12-02 03:12:06,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt. +63: [2022-12-02 03:12:06,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt. +63: [2022-12-02 03:12:06,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt. +63: [2022-12-02 03:12:06,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt. + 1: [2022-12-02 03:12:06,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt + 1: [2022-12-02 03:12:06,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 1: [2022-12-02 03:12:06,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. + 1: [2022-12-02 03:12:06,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt + 1: [2022-12-02 03:12:06,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +38: [2022-12-02 03:12:06,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt. +38: [2022-12-02 03:12:06,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt +38: [2022-12-02 03:12:06,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +12: [2022-12-02 03:12:06,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +12: [2022-12-02 03:12:06,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +12: [2022-12-02 03:12:06,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +49: [2022-12-02 03:12:06,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt. +49: [2022-12-02 03:12:06,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt. +49: [2022-12-02 03:12:06,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt. +49: [2022-12-02 03:12:06,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt. +49: [2022-12-02 03:12:06,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt +49: [2022-12-02 03:12:06,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt +49: [2022-12-02 03:12:06,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt +49: [2022-12-02 03:12:06,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt +49: [2022-12-02 03:12:06,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +49: [2022-12-02 03:12:06,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +49: [2022-12-02 03:12:06,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +49: [2022-12-02 03:12:06,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +36: [2022-12-02 03:12:06,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt. +27: [2022-12-02 03:12:06,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. +27: [2022-12-02 03:12:06,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. +27: [2022-12-02 03:12:06,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. +36: [2022-12-02 03:12:06,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt +27: [2022-12-02 03:12:06,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt +36: [2022-12-02 03:12:06,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +12: [2022-12-02 03:12:06,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +27: [2022-12-02 03:12:06,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt +27: [2022-12-02 03:12:06,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt +12: [2022-12-02 03:12:06,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +27: [2022-12-02 03:12:06,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +27: [2022-12-02 03:12:06,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +27: [2022-12-02 03:12:06,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +12: [2022-12-02 03:12:06,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +12: [2022-12-02 03:12:06,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. +12: [2022-12-02 03:12:06,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt +12: [2022-12-02 03:12:06,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +36: [2022-12-02 03:12:06,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt. +36: [2022-12-02 03:12:06,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt +36: [2022-12-02 03:12:06,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +25: [2022-12-02 03:12:06,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. +25: [2022-12-02 03:12:06,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt +25: [2022-12-02 03:12:06,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +25: [2022-12-02 03:12:06,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. +25: [2022-12-02 03:12:06,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt +25: [2022-12-02 03:12:06,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +19: [2022-12-02 03:12:06,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. +19: [2022-12-02 03:12:06,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt +19: [2022-12-02 03:12:06,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +57: [2022-12-02 03:12:06,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt. +57: [2022-12-02 03:12:06,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt. +57: [2022-12-02 03:12:06,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt +57: [2022-12-02 03:12:06,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt +57: [2022-12-02 03:12:06,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +57: [2022-12-02 03:12:06,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +44: [2022-12-02 03:12:06,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt. +26: [2022-12-02 03:12:06,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. +41: [2022-12-02 03:12:06,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt. +63: [2022-12-02 03:12:06,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt +44: [2022-12-02 03:12:06,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt +26: [2022-12-02 03:12:06,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt +41: [2022-12-02 03:12:06,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt +63: [2022-12-02 03:12:06,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt +63: [2022-12-02 03:12:06,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt +63: [2022-12-02 03:12:06,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt +44: [2022-12-02 03:12:06,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +26: [2022-12-02 03:12:06,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +41: [2022-12-02 03:12:06,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +63: [2022-12-02 03:12:06,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +63: [2022-12-02 03:12:06,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +63: [2022-12-02 03:12:06,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +63: [2022-12-02 03:12:06,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +14: [2022-12-02 03:12:06,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +14: [2022-12-02 03:12:06,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. +14: [2022-12-02 03:12:06,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +14: [2022-12-02 03:12:06,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +14: [2022-12-02 03:12:06,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt +14: [2022-12-02 03:12:06,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +37: [2022-12-02 03:12:06,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt. +14: [2022-12-02 03:12:06,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +14: [2022-12-02 03:12:06,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +37: [2022-12-02 03:12:06,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt +14: [2022-12-02 03:12:06,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +37: [2022-12-02 03:12:06,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +14: [2022-12-02 03:12:06,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +14: [2022-12-02 03:12:06,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +14: [2022-12-02 03:12:06,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +25: [2022-12-02 03:12:06,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. +25: [2022-12-02 03:12:06,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt +25: [2022-12-02 03:12:06,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +51: [2022-12-02 03:12:06,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt. +51: [2022-12-02 03:12:06,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt. +51: [2022-12-02 03:12:06,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt +51: [2022-12-02 03:12:06,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt +51: [2022-12-02 03:12:06,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +51: [2022-12-02 03:12:06,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +14: [2022-12-02 03:12:06,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. +14: [2022-12-02 03:12:06,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt +14: [2022-12-02 03:12:06,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +30: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. +37: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt. +30: [2022-12-02 03:12:06,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt +30: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 4: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. + 4: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. + 4: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +37: [2022-12-02 03:12:06,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt + 4: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. + 4: [2022-12-02 03:12:06,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +37: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 2: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +30: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. + 4: [2022-12-02 03:12:06,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt + 4: [2022-12-02 03:12:06,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt + 2: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. + 4: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 2: [2022-12-02 03:12:06,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +30: [2022-12-02 03:12:06,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt + 4: [2022-12-02 03:12:06,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt + 2: [2022-12-02 03:12:06,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt + 2: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 2: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +30: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 4: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 4: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 2: [2022-12-02 03:12:06,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt + 2: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 4: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 2: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +30: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. +30: [2022-12-02 03:12:06,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt +30: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +26: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. +26: [2022-12-02 03:12:06,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt +26: [2022-12-02 03:12:06,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 5: [2022-12-02 03:12:06,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. + 5: [2022-12-02 03:12:06,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt + 5: [2022-12-02 03:12:06,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +55: [2022-12-02 03:12:06,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt. +55: [2022-12-02 03:12:06,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt +55: [2022-12-02 03:12:06,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +61: [2022-12-02 03:12:06,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt +61: [2022-12-02 03:12:06,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +51: [2022-12-02 03:12:06,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt. +51: [2022-12-02 03:12:06,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt +51: [2022-12-02 03:12:06,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +51: [2022-12-02 03:12:06,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt. +51: [2022-12-02 03:12:06,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt +51: [2022-12-02 03:12:06,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +31: [2022-12-02 03:12:06,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. +31: [2022-12-02 03:12:06,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt +31: [2022-12-02 03:12:06,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +22: [2022-12-02 03:12:06,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. +22: [2022-12-02 03:12:06,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. +22: [2022-12-02 03:12:06,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. +22: [2022-12-02 03:12:06,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt +22: [2022-12-02 03:12:06,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt +22: [2022-12-02 03:12:06,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt +22: [2022-12-02 03:12:06,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +22: [2022-12-02 03:12:06,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +22: [2022-12-02 03:12:06,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 1: [2022-12-02 03:12:06,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. + 1: [2022-12-02 03:12:06,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt + 1: [2022-12-02 03:12:06,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +39: [2022-12-02 03:12:06,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt. +39: [2022-12-02 03:12:06,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt +39: [2022-12-02 03:12:06,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +57: [2022-12-02 03:12:06,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt. +57: [2022-12-02 03:12:06,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt +57: [2022-12-02 03:12:06,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +18: [2022-12-02 03:12:06,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. +18: [2022-12-02 03:12:06,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt +18: [2022-12-02 03:12:06,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +41: [2022-12-02 03:12:06,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt. +41: [2022-12-02 03:12:06,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt +41: [2022-12-02 03:12:06,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +18: [2022-12-02 03:12:06,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. +18: [2022-12-02 03:12:06,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt +18: [2022-12-02 03:12:06,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +31: [2022-12-02 03:12:06,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. +31: [2022-12-02 03:12:06,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. +31: [2022-12-02 03:12:06,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt +31: [2022-12-02 03:12:06,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt +31: [2022-12-02 03:12:06,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +31: [2022-12-02 03:12:06,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +33: [2022-12-02 03:12:06,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt. +33: [2022-12-02 03:12:06,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt +33: [2022-12-02 03:12:06,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +35: [2022-12-02 03:12:06,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt. +35: [2022-12-02 03:12:06,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt. +35: [2022-12-02 03:12:06,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt. +35: [2022-12-02 03:12:06,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt +35: [2022-12-02 03:12:06,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt +35: [2022-12-02 03:12:06,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt +35: [2022-12-02 03:12:06,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +35: [2022-12-02 03:12:06,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +35: [2022-12-02 03:12:06,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +39: [2022-12-02 03:12:06,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt. +39: [2022-12-02 03:12:06,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt. +39: [2022-12-02 03:12:06,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt +39: [2022-12-02 03:12:06,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt +39: [2022-12-02 03:12:06,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +39: [2022-12-02 03:12:06,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +61: [2022-12-02 03:12:06,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt. +61: [2022-12-02 03:12:06,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt +61: [2022-12-02 03:12:06,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +42: [2022-12-02 03:12:06,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt. +42: [2022-12-02 03:12:06,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt +42: [2022-12-02 03:12:06,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +18: [2022-12-02 03:12:06,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. +18: [2022-12-02 03:12:06,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt +18: [2022-12-02 03:12:06,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 5: [2022-12-02 03:12:06,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +48: [2022-12-02 03:12:06,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt. +48: [2022-12-02 03:12:06,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt + 5: [2022-12-02 03:12:06,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +48: [2022-12-02 03:12:06,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 5: [2022-12-02 03:12:06,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 5: [2022-12-02 03:12:06,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. + 5: [2022-12-02 03:12:06,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt + 5: [2022-12-02 03:12:06,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +45: [2022-12-02 03:12:06,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt. +45: [2022-12-02 03:12:06,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt +45: [2022-12-02 03:12:06,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +36: [2022-12-02 03:12:06,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt. +36: [2022-12-02 03:12:06,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt +36: [2022-12-02 03:12:06,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +58: [2022-12-02 03:12:06,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt. +58: [2022-12-02 03:12:06,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt. +58: [2022-12-02 03:12:06,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt. +58: [2022-12-02 03:12:06,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt. +58: [2022-12-02 03:12:06,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt +58: [2022-12-02 03:12:06,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt +58: [2022-12-02 03:12:06,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt +11: [2022-12-02 03:12:06,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. +58: [2022-12-02 03:12:06,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt +58: [2022-12-02 03:12:06,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +58: [2022-12-02 03:12:06,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +58: [2022-12-02 03:12:06,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +58: [2022-12-02 03:12:06,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +11: [2022-12-02 03:12:06,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt +11: [2022-12-02 03:12:06,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +49: [2022-12-02 03:12:06,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt. +49: [2022-12-02 03:12:06,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt +49: [2022-12-02 03:12:06,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 4: [2022-12-02 03:12:06,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. + 4: [2022-12-02 03:12:06,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt + 4: [2022-12-02 03:12:06,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +39: [2022-12-02 03:12:06,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt. +39: [2022-12-02 03:12:06,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt +39: [2022-12-02 03:12:06,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +54: [2022-12-02 03:12:06,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt. +54: [2022-12-02 03:12:06,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt +54: [2022-12-02 03:12:06,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +50: [2022-12-02 03:12:06,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt. +50: [2022-12-02 03:12:06,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt +50: [2022-12-02 03:12:06,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +43: [2022-12-02 03:12:06,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt. +43: [2022-12-02 03:12:06,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt +43: [2022-12-02 03:12:06,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +56: [2022-12-02 03:12:06,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt. +56: [2022-12-02 03:12:06,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt +56: [2022-12-02 03:12:06,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +32: [2022-12-02 03:12:06,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt. +32: [2022-12-02 03:12:06,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt +32: [2022-12-02 03:12:06,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +52: [2022-12-02 03:12:06,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt. +52: [2022-12-02 03:12:06,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt. +52: [2022-12-02 03:12:06,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt. +52: [2022-12-02 03:12:06,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt. +52: [2022-12-02 03:12:06,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt. +52: [2022-12-02 03:12:06,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt +52: [2022-12-02 03:12:06,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt +52: [2022-12-02 03:12:06,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +52: [2022-12-02 03:12:06,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +52: [2022-12-02 03:12:06,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt +52: [2022-12-02 03:12:06,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt +52: [2022-12-02 03:12:06,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt +52: [2022-12-02 03:12:06,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +52: [2022-12-02 03:12:06,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +52: [2022-12-02 03:12:06,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +24: [2022-12-02 03:12:06,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. +24: [2022-12-02 03:12:06,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt +24: [2022-12-02 03:12:06,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +33: [2022-12-02 03:12:06,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt. +33: [2022-12-02 03:12:06,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt +33: [2022-12-02 03:12:06,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +24: [2022-12-02 03:12:06,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. +24: [2022-12-02 03:12:06,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt +24: [2022-12-02 03:12:06,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +59: [2022-12-02 03:12:06,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt. +59: [2022-12-02 03:12:06,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt +59: [2022-12-02 03:12:06,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +40: [2022-12-02 03:12:06,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt. +40: [2022-12-02 03:12:06,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt. +40: [2022-12-02 03:12:06,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt. +40: [2022-12-02 03:12:06,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt. +40: [2022-12-02 03:12:06,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt. +40: [2022-12-02 03:12:06,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt +40: [2022-12-02 03:12:06,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt +40: [2022-12-02 03:12:06,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt +40: [2022-12-02 03:12:06,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt +40: [2022-12-02 03:12:06,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt +40: [2022-12-02 03:12:06,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +40: [2022-12-02 03:12:06,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +40: [2022-12-02 03:12:06,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +40: [2022-12-02 03:12:06,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +40: [2022-12-02 03:12:06,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 8: [2022-12-02 03:12:06,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. + 8: [2022-12-02 03:12:06,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt + 8: [2022-12-02 03:12:06,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +47: [2022-12-02 03:12:06,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt. +47: [2022-12-02 03:12:06,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt +47: [2022-12-02 03:12:06,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +28: [2022-12-02 03:12:06,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. +28: [2022-12-02 03:12:06,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt +28: [2022-12-02 03:12:06,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +63: [2022-12-02 03:12:06,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt. +17: [2022-12-02 03:12:06,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. +17: [2022-12-02 03:12:06,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt +17: [2022-12-02 03:12:06,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 3: [2022-12-02 03:12:06,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. + 3: [2022-12-02 03:12:06,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt + 3: [2022-12-02 03:12:06,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +35: [2022-12-02 03:12:06,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt. +35: [2022-12-02 03:12:06,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt +35: [2022-12-02 03:12:06,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +60: [2022-12-02 03:12:06,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt. +30: [2022-12-02 03:12:06,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. +30: [2022-12-02 03:12:06,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt +30: [2022-12-02 03:12:06,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 6: [2022-12-02 03:12:06,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. +60: [2022-12-02 03:12:06,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt +63: [2022-12-02 03:12:06,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt + 6: [2022-12-02 03:12:06,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt +60: [2022-12-02 03:12:06,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +63: [2022-12-02 03:12:06,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 6: [2022-12-02 03:12:06,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +44: [2022-12-02 03:12:06,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt. +34: [2022-12-02 03:12:06,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt. +34: [2022-12-02 03:12:06,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt. +34: [2022-12-02 03:12:06,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt. +34: [2022-12-02 03:12:06,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt. +44: [2022-12-02 03:12:06,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt +34: [2022-12-02 03:12:06,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt +34: [2022-12-02 03:12:06,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt +34: [2022-12-02 03:12:06,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt +34: [2022-12-02 03:12:06,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt +44: [2022-12-02 03:12:06,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +34: [2022-12-02 03:12:06,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +34: [2022-12-02 03:12:06,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +34: [2022-12-02 03:12:06,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +34: [2022-12-02 03:12:06,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +33: [2022-12-02 03:12:06,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt. +33: [2022-12-02 03:12:06,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt +33: [2022-12-02 03:12:06,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +62: [2022-12-02 03:12:06,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt. +62: [2022-12-02 03:12:06,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt +62: [2022-12-02 03:12:06,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 7: [2022-12-02 03:12:06,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. + 7: [2022-12-02 03:12:06,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt + 7: [2022-12-02 03:12:06,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +10: [2022-12-02 03:12:06,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. +10: [2022-12-02 03:12:06,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt +10: [2022-12-02 03:12:06,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 9: [2022-12-02 03:12:06,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. + 9: [2022-12-02 03:12:06,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt + 9: [2022-12-02 03:12:06,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +34: [2022-12-02 03:12:06,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt. +34: [2022-12-02 03:12:06,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt +34: [2022-12-02 03:12:06,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +46: [2022-12-02 03:12:06,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt. +46: [2022-12-02 03:12:06,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt +46: [2022-12-02 03:12:06,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +51: [2022-12-02 03:12:06,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt. +51: [2022-12-02 03:12:06,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt +51: [2022-12-02 03:12:06,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +58: [2022-12-02 03:12:06,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt. +58: [2022-12-02 03:12:06,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt +58: [2022-12-02 03:12:06,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +23: [2022-12-02 03:12:06,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. +23: [2022-12-02 03:12:06,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt +23: [2022-12-02 03:12:06,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +15: [2022-12-02 03:12:06,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +15: [2022-12-02 03:12:06,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +15: [2022-12-02 03:12:06,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +29: [2022-12-02 03:12:06,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. +29: [2022-12-02 03:12:06,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt +29: [2022-12-02 03:12:06,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +21: [2022-12-02 03:12:06,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. +21: [2022-12-02 03:12:06,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt +21: [2022-12-02 03:12:06,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +13: [2022-12-02 03:12:06,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. +13: [2022-12-02 03:12:06,941] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt +13: [2022-12-02 03:12:06,941] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +26: [2022-12-02 03:12:06,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. +26: [2022-12-02 03:12:06,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt +26: [2022-12-02 03:12:06,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +22: [2022-12-02 03:12:06,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. +22: [2022-12-02 03:12:06,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt +22: [2022-12-02 03:12:06,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +12: [2022-12-02 03:12:06,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. +12: [2022-12-02 03:12:06,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt +12: [2022-12-02 03:12:06,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +11: [2022-12-02 03:12:06,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +11: [2022-12-02 03:12:06,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +11: [2022-12-02 03:12:06,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +27: [2022-12-02 03:12:06,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. +27: [2022-12-02 03:12:06,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt +27: [2022-12-02 03:12:06,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +31: [2022-12-02 03:12:06,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. +31: [2022-12-02 03:12:06,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt +31: [2022-12-02 03:12:06,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +25: [2022-12-02 03:12:06,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. +25: [2022-12-02 03:12:06,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt +25: [2022-12-02 03:12:06,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +19: [2022-12-02 03:12:06,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. +19: [2022-12-02 03:12:06,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt +19: [2022-12-02 03:12:06,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 2: [2022-12-02 03:12:06,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. + 2: [2022-12-02 03:12:06,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt + 2: [2022-12-02 03:12:06,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 5: [2022-12-02 03:12:06,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. + 5: [2022-12-02 03:12:06,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt + 5: [2022-12-02 03:12:06,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +20: [2022-12-02 03:12:06,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. +20: [2022-12-02 03:12:06,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt +20: [2022-12-02 03:12:06,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +38: [2022-12-02 03:12:07,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt. +38: [2022-12-02 03:12:07,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt +38: [2022-12-02 03:12:07,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 0: [2022-12-02 03:12:07,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. + 0: [2022-12-02 03:12:07,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt + 0: [2022-12-02 03:12:07,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +16: [2022-12-02 03:12:07,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. +16: [2022-12-02 03:12:07,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt +16: [2022-12-02 03:12:07,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +40: [2022-12-02 03:12:07,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt. +40: [2022-12-02 03:12:07,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt +40: [2022-12-02 03:12:07,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +55: [2022-12-02 03:12:07,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt. +55: [2022-12-02 03:12:07,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt +55: [2022-12-02 03:12:07,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +57: [2022-12-02 03:12:07,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt. +57: [2022-12-02 03:12:07,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt +57: [2022-12-02 03:12:07,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +18: [2022-12-02 03:12:07,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. +18: [2022-12-02 03:12:07,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt +18: [2022-12-02 03:12:07,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +24: [2022-12-02 03:12:07,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. +24: [2022-12-02 03:12:07,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt +24: [2022-12-02 03:12:07,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +53: [2022-12-02 03:12:07,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt. +53: [2022-12-02 03:12:07,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt +53: [2022-12-02 03:12:07,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +56: [2022-12-02 03:12:07,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt. +56: [2022-12-02 03:12:07,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt +56: [2022-12-02 03:12:07,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +14: [2022-12-02 03:12:07,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. +14: [2022-12-02 03:12:07,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt +14: [2022-12-02 03:12:07,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +37: [2022-12-02 03:12:07,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt. +37: [2022-12-02 03:12:07,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt +37: [2022-12-02 03:12:07,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +52: [2022-12-02 03:12:07,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt. +52: [2022-12-02 03:12:07,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt +52: [2022-12-02 03:12:07,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 4: [2022-12-02 03:12:07,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. + 4: [2022-12-02 03:12:07,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt + 4: [2022-12-02 03:12:07,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +41: [2022-12-02 03:12:07,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt. +41: [2022-12-02 03:12:07,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt +41: [2022-12-02 03:12:07,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +49: [2022-12-02 03:12:07,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt. +49: [2022-12-02 03:12:07,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt +49: [2022-12-02 03:12:07,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +50: [2022-12-02 03:12:07,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt. +50: [2022-12-02 03:12:07,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt +50: [2022-12-02 03:12:07,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +39: [2022-12-02 03:12:07,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt. +43: [2022-12-02 03:12:07,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt. +43: [2022-12-02 03:12:07,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt +43: [2022-12-02 03:12:07,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +39: [2022-12-02 03:12:07,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt +48: [2022-12-02 03:12:07,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt. +39: [2022-12-02 03:12:07,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +48: [2022-12-02 03:12:07,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt +42: [2022-12-02 03:12:07,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt. +48: [2022-12-02 03:12:07,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +42: [2022-12-02 03:12:07,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt +42: [2022-12-02 03:12:07,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +32: [2022-12-02 03:12:07,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt. +32: [2022-12-02 03:12:07,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt +32: [2022-12-02 03:12:07,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +28: [2022-12-02 03:12:07,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. +28: [2022-12-02 03:12:07,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt +28: [2022-12-02 03:12:07,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 6: [2022-12-02 03:12:07,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. + 6: [2022-12-02 03:12:07,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt + 6: [2022-12-02 03:12:07,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +33: [2022-12-02 03:12:07,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt. +33: [2022-12-02 03:12:07,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt +33: [2022-12-02 03:12:07,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +59: [2022-12-02 03:12:07,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt. +59: [2022-12-02 03:12:07,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt +59: [2022-12-02 03:12:07,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +10: [2022-12-02 03:12:07,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. +10: [2022-12-02 03:12:07,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt +10: [2022-12-02 03:12:07,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +35: [2022-12-02 03:12:07,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt. +35: [2022-12-02 03:12:07,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt +35: [2022-12-02 03:12:07,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +34: [2022-12-02 03:12:07,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt. +34: [2022-12-02 03:12:07,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt +34: [2022-12-02 03:12:07,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +36: [2022-12-02 03:12:07,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt. +36: [2022-12-02 03:12:07,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt +36: [2022-12-02 03:12:07,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +23: [2022-12-02 03:12:07,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. +23: [2022-12-02 03:12:07,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt +23: [2022-12-02 03:12:07,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +17: [2022-12-02 03:12:07,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. +17: [2022-12-02 03:12:07,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt +17: [2022-12-02 03:12:07,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +63: [2022-12-02 03:12:07,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt. +51: [2022-12-02 03:12:07,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt. +63: [2022-12-02 03:12:07,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt +63: [2022-12-02 03:12:07,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 3: [2022-12-02 03:12:07,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. + 3: [2022-12-02 03:12:07,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt + 3: [2022-12-02 03:12:07,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +61: [2022-12-02 03:12:07,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt. + 8: [2022-12-02 03:12:07,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. + 8: [2022-12-02 03:12:07,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt + 8: [2022-12-02 03:12:07,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +51: [2022-12-02 03:12:07,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt +51: [2022-12-02 03:12:07,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +60: [2022-12-02 03:12:07,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt. +61: [2022-12-02 03:12:07,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt +61: [2022-12-02 03:12:07,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +54: [2022-12-02 03:12:07,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt. +60: [2022-12-02 03:12:07,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt +54: [2022-12-02 03:12:07,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt +60: [2022-12-02 03:12:07,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +54: [2022-12-02 03:12:07,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +11: [2022-12-02 03:12:07,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. +11: [2022-12-02 03:12:07,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt +11: [2022-12-02 03:12:07,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +62: [2022-12-02 03:12:07,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt. +62: [2022-12-02 03:12:07,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt +47: [2022-12-02 03:12:07,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt. +62: [2022-12-02 03:12:07,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +47: [2022-12-02 03:12:07,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt +47: [2022-12-02 03:12:07,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 9: [2022-12-02 03:12:07,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. + 9: [2022-12-02 03:12:07,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt + 9: [2022-12-02 03:12:07,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 7: [2022-12-02 03:12:07,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. + 7: [2022-12-02 03:12:07,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt + 7: [2022-12-02 03:12:07,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +26: [2022-12-02 03:12:07,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. +26: [2022-12-02 03:12:07,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt +26: [2022-12-02 03:12:07,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +46: [2022-12-02 03:12:07,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt. +46: [2022-12-02 03:12:07,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt +46: [2022-12-02 03:12:07,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +29: [2022-12-02 03:12:07,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. +29: [2022-12-02 03:12:07,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt +29: [2022-12-02 03:12:07,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +30: [2022-12-02 03:12:07,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. +30: [2022-12-02 03:12:07,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt +30: [2022-12-02 03:12:07,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +13: [2022-12-02 03:12:07,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +13: [2022-12-02 03:12:07,083] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +13: [2022-12-02 03:12:07,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +45: [2022-12-02 03:12:07,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt. +45: [2022-12-02 03:12:07,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt +45: [2022-12-02 03:12:07,085] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +27: [2022-12-02 03:12:07,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. +27: [2022-12-02 03:12:07,086] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt +27: [2022-12-02 03:12:07,086] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +44: [2022-12-02 03:12:07,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt. +44: [2022-12-02 03:12:07,087] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt +44: [2022-12-02 03:12:07,087] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +12: [2022-12-02 03:12:07,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. +12: [2022-12-02 03:12:07,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt +12: [2022-12-02 03:12:07,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +15: [2022-12-02 03:12:07,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. +15: [2022-12-02 03:12:07,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt +15: [2022-12-02 03:12:07,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 1: [2022-12-02 03:12:07,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. + 2: [2022-12-02 03:12:07,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. + 2: [2022-12-02 03:12:07,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt + 2: [2022-12-02 03:12:07,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +25: [2022-12-02 03:12:07,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. +25: [2022-12-02 03:12:07,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt +25: [2022-12-02 03:12:07,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +21: [2022-12-02 03:12:07,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. +21: [2022-12-02 03:12:07,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt +21: [2022-12-02 03:12:07,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 1: [2022-12-02 03:12:07,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt +58: [2022-12-02 03:12:07,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt. + 1: [2022-12-02 03:12:07,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +58: [2022-12-02 03:12:07,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt +58: [2022-12-02 03:12:07,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +20: [2022-12-02 03:12:07,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. +20: [2022-12-02 03:12:07,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt +20: [2022-12-02 03:12:07,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +31: [2022-12-02 03:12:07,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. +31: [2022-12-02 03:12:07,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt +31: [2022-12-02 03:12:07,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +38: [2022-12-02 03:12:07,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt. +38: [2022-12-02 03:12:07,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt +38: [2022-12-02 03:12:07,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 0: [2022-12-02 03:12:07,098] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. + 0: [2022-12-02 03:12:07,098] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt + 0: [2022-12-02 03:12:07,098] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +22: [2022-12-02 03:12:07,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. + 5: [2022-12-02 03:12:07,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +22: [2022-12-02 03:12:07,100] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt + 5: [2022-12-02 03:12:07,099] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +22: [2022-12-02 03:12:07,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 5: [2022-12-02 03:12:07,099] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +40: [2022-12-02 03:12:07,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt. +40: [2022-12-02 03:12:07,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt +40: [2022-12-02 03:12:07,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +55: [2022-12-02 03:12:07,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt. +55: [2022-12-02 03:12:07,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt +55: [2022-12-02 03:12:07,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +57: [2022-12-02 03:12:07,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt. +57: [2022-12-02 03:12:07,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt +57: [2022-12-02 03:12:07,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +18: [2022-12-02 03:12:07,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. +18: [2022-12-02 03:12:07,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt +18: [2022-12-02 03:12:07,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +53: [2022-12-02 03:12:07,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt. +16: [2022-12-02 03:12:07,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. +16: [2022-12-02 03:12:07,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt +16: [2022-12-02 03:12:07,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +53: [2022-12-02 03:12:07,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt +53: [2022-12-02 03:12:07,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +56: [2022-12-02 03:12:07,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt. + 4: [2022-12-02 03:12:07,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. + 4: [2022-12-02 03:12:07,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt + 4: [2022-12-02 03:12:07,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +24: [2022-12-02 03:12:07,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. +24: [2022-12-02 03:12:07,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt +24: [2022-12-02 03:12:07,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +41: [2022-12-02 03:12:07,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt. +41: [2022-12-02 03:12:07,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt +41: [2022-12-02 03:12:07,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +56: [2022-12-02 03:12:07,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt +56: [2022-12-02 03:12:07,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +19: [2022-12-02 03:12:07,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. +19: [2022-12-02 03:12:07,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt +19: [2022-12-02 03:12:07,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 1: [2022-12-02 03:12:07,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +37: [2022-12-02 03:12:07,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt. +37: [2022-12-02 03:12:07,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt +37: [2022-12-02 03:12:07,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +14: [2022-12-02 03:12:07,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. +14: [2022-12-02 03:12:07,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt +14: [2022-12-02 03:12:07,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +50: [2022-12-02 03:12:07,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt. +62: [2022-12-02 03:12:07,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt. +50: [2022-12-02 03:12:07,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt +62: [2022-12-02 03:12:07,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt +50: [2022-12-02 03:12:07,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +62: [2022-12-02 03:12:07,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +32: [2022-12-02 03:12:07,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt. +32: [2022-12-02 03:12:07,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt +32: [2022-12-02 03:12:07,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +48: [2022-12-02 03:12:07,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt. +48: [2022-12-02 03:12:07,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt +48: [2022-12-02 03:12:07,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +17: [2022-12-02 03:12:07,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. +17: [2022-12-02 03:12:07,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt +17: [2022-12-02 03:12:07,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +47: [2022-12-02 03:12:07,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt. +47: [2022-12-02 03:12:07,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt +47: [2022-12-02 03:12:07,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +34: [2022-12-02 03:12:07,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt. +34: [2022-12-02 03:12:07,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt +34: [2022-12-02 03:12:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +42: [2022-12-02 03:12:07,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt. +42: [2022-12-02 03:12:07,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt +42: [2022-12-02 03:12:07,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +43: [2022-12-02 03:12:07,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt. +43: [2022-12-02 03:12:07,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt +43: [2022-12-02 03:12:07,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 6: [2022-12-02 03:12:07,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +44: [2022-12-02 03:12:07,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt. + 6: [2022-12-02 03:12:07,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +44: [2022-12-02 03:12:07,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt +44: [2022-12-02 03:12:07,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 6: [2022-12-02 03:12:07,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +35: [2022-12-02 03:12:07,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt. +35: [2022-12-02 03:12:07,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt +35: [2022-12-02 03:12:07,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +63: [2022-12-02 03:12:07,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt. +63: [2022-12-02 03:12:07,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt +63: [2022-12-02 03:12:07,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +39: [2022-12-02 03:12:07,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt. +39: [2022-12-02 03:12:07,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt +39: [2022-12-02 03:12:07,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 1: [2022-12-02 03:12:07,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt + 1: [2022-12-02 03:12:07,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +33: [2022-12-02 03:12:07,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt. +33: [2022-12-02 03:12:07,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt +33: [2022-12-02 03:12:07,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +28: [2022-12-02 03:12:07,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. +28: [2022-12-02 03:12:07,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt +28: [2022-12-02 03:12:07,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +60: [2022-12-02 03:12:07,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt. +60: [2022-12-02 03:12:07,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt +60: [2022-12-02 03:12:07,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +61: [2022-12-02 03:12:07,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt. +61: [2022-12-02 03:12:07,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt +61: [2022-12-02 03:12:07,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 3: [2022-12-02 03:12:07,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. + 3: [2022-12-02 03:12:07,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt + 3: [2022-12-02 03:12:07,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +49: [2022-12-02 03:12:07,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt. +49: [2022-12-02 03:12:07,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt +49: [2022-12-02 03:12:07,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 9: [2022-12-02 03:12:07,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. + 9: [2022-12-02 03:12:07,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt + 9: [2022-12-02 03:12:07,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +54: [2022-12-02 03:12:07,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt. +54: [2022-12-02 03:12:07,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt +54: [2022-12-02 03:12:07,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +46: [2022-12-02 03:12:07,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt. +46: [2022-12-02 03:12:07,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt +46: [2022-12-02 03:12:07,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +13: [2022-12-02 03:12:07,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. +13: [2022-12-02 03:12:07,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt +13: [2022-12-02 03:12:07,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +30: [2022-12-02 03:12:07,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. +30: [2022-12-02 03:12:07,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt +30: [2022-12-02 03:12:07,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +52: [2022-12-02 03:12:07,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt. +52: [2022-12-02 03:12:07,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt +52: [2022-12-02 03:12:07,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 8: [2022-12-02 03:12:07,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. + 8: [2022-12-02 03:12:07,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt + 8: [2022-12-02 03:12:07,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +21: [2022-12-02 03:12:07,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. +21: [2022-12-02 03:12:07,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt +21: [2022-12-02 03:12:07,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 7: [2022-12-02 03:12:07,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. + 7: [2022-12-02 03:12:07,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt + 7: [2022-12-02 03:12:07,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +59: [2022-12-02 03:12:07,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt. +59: [2022-12-02 03:12:07,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt +59: [2022-12-02 03:12:07,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +23: [2022-12-02 03:12:07,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. +23: [2022-12-02 03:12:07,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt +23: [2022-12-02 03:12:07,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +29: [2022-12-02 03:12:07,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. +29: [2022-12-02 03:12:07,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt +29: [2022-12-02 03:12:07,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +12: [2022-12-02 03:12:07,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +12: [2022-12-02 03:12:07,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +12: [2022-12-02 03:12:07,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +11: [2022-12-02 03:12:07,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +11: [2022-12-02 03:12:07,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +11: [2022-12-02 03:12:07,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +26: [2022-12-02 03:12:07,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. +26: [2022-12-02 03:12:07,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt +26: [2022-12-02 03:12:07,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +45: [2022-12-02 03:12:07,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt. +45: [2022-12-02 03:12:07,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt +45: [2022-12-02 03:12:07,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +51: [2022-12-02 03:12:07,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt. +51: [2022-12-02 03:12:07,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt +51: [2022-12-02 03:12:07,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +58: [2022-12-02 03:12:07,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt. +58: [2022-12-02 03:12:07,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt +58: [2022-12-02 03:12:07,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +15: [2022-12-02 03:12:07,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +15: [2022-12-02 03:12:07,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +15: [2022-12-02 03:12:07,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +36: [2022-12-02 03:12:07,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt. +36: [2022-12-02 03:12:07,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt +36: [2022-12-02 03:12:07,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +20: [2022-12-02 03:12:07,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. +20: [2022-12-02 03:12:07,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt +20: [2022-12-02 03:12:07,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +53: [2022-12-02 03:12:07,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt. +53: [2022-12-02 03:12:07,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt +53: [2022-12-02 03:12:07,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +48: [2022-12-02 03:12:07,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt. +48: [2022-12-02 03:12:07,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt +48: [2022-12-02 03:12:07,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +41: [2022-12-02 03:12:07,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt. +41: [2022-12-02 03:12:07,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt +41: [2022-12-02 03:12:07,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +55: [2022-12-02 03:12:07,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt. +55: [2022-12-02 03:12:07,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt +55: [2022-12-02 03:12:07,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +37: [2022-12-02 03:12:07,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt. +37: [2022-12-02 03:12:07,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt +37: [2022-12-02 03:12:07,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +45: [2022-12-02 03:12:07,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt. +45: [2022-12-02 03:12:07,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt +45: [2022-12-02 03:12:07,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +38: [2022-12-02 03:12:07,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt. +38: [2022-12-02 03:12:07,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt +38: [2022-12-02 03:12:07,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +14: [2022-12-02 03:12:07,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +14: [2022-12-02 03:12:07,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +14: [2022-12-02 03:12:07,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +43: [2022-12-02 03:12:07,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt. +43: [2022-12-02 03:12:07,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt +43: [2022-12-02 03:12:07,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 2: [2022-12-02 03:12:07,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. + 2: [2022-12-02 03:12:07,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt + 2: [2022-12-02 03:12:07,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 1: [2022-12-02 03:12:07,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. +57: [2022-12-02 03:12:07,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt. +57: [2022-12-02 03:12:07,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt +57: [2022-12-02 03:12:07,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 4: [2022-12-02 03:12:07,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. + 4: [2022-12-02 03:12:07,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt + 4: [2022-12-02 03:12:07,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +42: [2022-12-02 03:12:07,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt. +42: [2022-12-02 03:12:07,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt +42: [2022-12-02 03:12:07,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 0: [2022-12-02 03:12:07,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. + 0: [2022-12-02 03:12:07,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt + 0: [2022-12-02 03:12:07,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +27: [2022-12-02 03:12:07,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. +27: [2022-12-02 03:12:07,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt +27: [2022-12-02 03:12:07,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +32: [2022-12-02 03:12:07,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt. +32: [2022-12-02 03:12:07,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt +32: [2022-12-02 03:12:07,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +54: [2022-12-02 03:12:07,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt. + 6: [2022-12-02 03:12:07,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. +18: [2022-12-02 03:12:07,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. + 6: [2022-12-02 03:12:07,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt +54: [2022-12-02 03:12:07,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt + 6: [2022-12-02 03:12:07,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +54: [2022-12-02 03:12:07,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +18: [2022-12-02 03:12:07,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt +18: [2022-12-02 03:12:07,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +50: [2022-12-02 03:12:07,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt. +40: [2022-12-02 03:12:07,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt. +50: [2022-12-02 03:12:07,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt +40: [2022-12-02 03:12:07,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt +50: [2022-12-02 03:12:07,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +40: [2022-12-02 03:12:07,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +31: [2022-12-02 03:12:07,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. +49: [2022-12-02 03:12:07,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt. +31: [2022-12-02 03:12:07,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt +49: [2022-12-02 03:12:07,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt +31: [2022-12-02 03:12:07,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +49: [2022-12-02 03:12:07,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +59: [2022-12-02 03:12:07,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt. +59: [2022-12-02 03:12:07,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt +59: [2022-12-02 03:12:07,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +39: [2022-12-02 03:12:07,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt. +22: [2022-12-02 03:12:07,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. +39: [2022-12-02 03:12:07,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt +22: [2022-12-02 03:12:07,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt +39: [2022-12-02 03:12:07,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +22: [2022-12-02 03:12:07,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +52: [2022-12-02 03:12:07,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt. +52: [2022-12-02 03:12:07,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt +33: [2022-12-02 03:12:07,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt. +52: [2022-12-02 03:12:07,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +33: [2022-12-02 03:12:07,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt +33: [2022-12-02 03:12:07,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +28: [2022-12-02 03:12:07,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. +28: [2022-12-02 03:12:07,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt +28: [2022-12-02 03:12:07,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +61: [2022-12-02 03:12:07,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt. + 9: [2022-12-02 03:12:07,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. + 9: [2022-12-02 03:12:07,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt + 9: [2022-12-02 03:12:07,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +61: [2022-12-02 03:12:07,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt +61: [2022-12-02 03:12:07,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +47: [2022-12-02 03:12:07,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt. +47: [2022-12-02 03:12:07,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt +47: [2022-12-02 03:12:07,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +24: [2022-12-02 03:12:07,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. +24: [2022-12-02 03:12:07,179] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt +24: [2022-12-02 03:12:07,179] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +36: [2022-12-02 03:12:07,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt. +36: [2022-12-02 03:12:07,179] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt +36: [2022-12-02 03:12:07,179] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +34: [2022-12-02 03:12:07,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt. +34: [2022-12-02 03:12:07,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt +34: [2022-12-02 03:12:07,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 1: [2022-12-02 03:12:07,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt + 1: [2022-12-02 03:12:07,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +56: [2022-12-02 03:12:07,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt. +56: [2022-12-02 03:12:07,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt +56: [2022-12-02 03:12:07,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +10: [2022-12-02 03:12:07,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. +10: [2022-12-02 03:12:07,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt +10: [2022-12-02 03:12:07,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +16: [2022-12-02 03:12:07,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. +16: [2022-12-02 03:12:07,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt +16: [2022-12-02 03:12:07,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +60: [2022-12-02 03:12:07,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt. +60: [2022-12-02 03:12:07,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt +60: [2022-12-02 03:12:07,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +17: [2022-12-02 03:12:07,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. +17: [2022-12-02 03:12:07,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt +17: [2022-12-02 03:12:07,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +19: [2022-12-02 03:12:07,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. +19: [2022-12-02 03:12:07,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt + 5: [2022-12-02 03:12:07,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. +19: [2022-12-02 03:12:07,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 5: [2022-12-02 03:12:07,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt + 5: [2022-12-02 03:12:07,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +25: [2022-12-02 03:12:07,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. +25: [2022-12-02 03:12:07,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt +25: [2022-12-02 03:12:07,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 8: [2022-12-02 03:12:07,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. + 8: [2022-12-02 03:12:07,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt + 8: [2022-12-02 03:12:07,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +51: [2022-12-02 03:12:07,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt. +51: [2022-12-02 03:12:07,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt +46: [2022-12-02 03:12:07,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt. +51: [2022-12-02 03:12:07,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +46: [2022-12-02 03:12:07,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt +46: [2022-12-02 03:12:07,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +62: [2022-12-02 03:12:07,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt. +62: [2022-12-02 03:12:07,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt +62: [2022-12-02 03:12:07,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +30: [2022-12-02 03:12:07,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. +30: [2022-12-02 03:12:07,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt +30: [2022-12-02 03:12:07,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +35: [2022-12-02 03:12:07,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt. +35: [2022-12-02 03:12:07,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt +35: [2022-12-02 03:12:07,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +63: [2022-12-02 03:12:07,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt. +63: [2022-12-02 03:12:07,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt +63: [2022-12-02 03:12:07,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 7: [2022-12-02 03:12:07,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. + 7: [2022-12-02 03:12:07,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt + 7: [2022-12-02 03:12:07,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +10: [2022-12-02 03:12:07,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +10: [2022-12-02 03:12:07,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +10: [2022-12-02 03:12:07,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +23: [2022-12-02 03:12:07,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. +23: [2022-12-02 03:12:07,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt +23: [2022-12-02 03:12:07,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +44: [2022-12-02 03:12:07,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt. +44: [2022-12-02 03:12:07,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt +44: [2022-12-02 03:12:07,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +21: [2022-12-02 03:12:07,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. +21: [2022-12-02 03:12:07,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt +21: [2022-12-02 03:12:07,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +29: [2022-12-02 03:12:07,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. +29: [2022-12-02 03:12:07,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt +29: [2022-12-02 03:12:07,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +58: [2022-12-02 03:12:07,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt. +58: [2022-12-02 03:12:07,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt +58: [2022-12-02 03:12:07,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +13: [2022-12-02 03:12:07,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +13: [2022-12-02 03:12:07,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +13: [2022-12-02 03:12:07,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +15: [2022-12-02 03:12:07,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +15: [2022-12-02 03:12:07,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +15: [2022-12-02 03:12:07,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +26: [2022-12-02 03:12:07,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. +26: [2022-12-02 03:12:07,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt +26: [2022-12-02 03:12:07,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +22: [2022-12-02 03:12:07,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. +22: [2022-12-02 03:12:07,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt +22: [2022-12-02 03:12:07,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +11: [2022-12-02 03:12:07,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. +11: [2022-12-02 03:12:07,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt +11: [2022-12-02 03:12:07,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 3: [2022-12-02 03:12:07,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. + 3: [2022-12-02 03:12:07,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt + 3: [2022-12-02 03:12:07,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +25: [2022-12-02 03:12:07,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. +25: [2022-12-02 03:12:07,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt +25: [2022-12-02 03:12:07,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +12: [2022-12-02 03:12:07,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. +12: [2022-12-02 03:12:07,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt +12: [2022-12-02 03:12:07,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +27: [2022-12-02 03:12:07,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. +27: [2022-12-02 03:12:07,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt +27: [2022-12-02 03:12:07,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +25: [2022-12-02 03:12:07,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. +25: [2022-12-02 03:12:07,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt +25: [2022-12-02 03:12:07,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +13: [2022-12-02 03:12:07,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +13: [2022-12-02 03:12:07,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +13: [2022-12-02 03:12:07,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +28: [2022-12-02 03:12:07,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. +28: [2022-12-02 03:12:07,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt +28: [2022-12-02 03:12:07,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +35: [2022-12-02 03:12:07,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt. +35: [2022-12-02 03:12:07,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt +35: [2022-12-02 03:12:07,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +15: [2022-12-02 03:12:07,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +15: [2022-12-02 03:12:07,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +15: [2022-12-02 03:12:07,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 2: [2022-12-02 03:12:07,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. + 2: [2022-12-02 03:12:07,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt + 2: [2022-12-02 03:12:07,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 8: [2022-12-02 03:12:07,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. + 8: [2022-12-02 03:12:07,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt + 8: [2022-12-02 03:12:07,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +26: [2022-12-02 03:12:07,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. +26: [2022-12-02 03:12:07,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt +26: [2022-12-02 03:12:07,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +19: [2022-12-02 03:12:07,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. +19: [2022-12-02 03:12:07,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. +19: [2022-12-02 03:12:07,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt +19: [2022-12-02 03:12:07,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt +19: [2022-12-02 03:12:07,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +19: [2022-12-02 03:12:07,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +23: [2022-12-02 03:12:07,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. +23: [2022-12-02 03:12:07,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt +23: [2022-12-02 03:12:07,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 1: [2022-12-02 03:12:07,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. + 1: [2022-12-02 03:12:07,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt + 1: [2022-12-02 03:12:07,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +30: [2022-12-02 03:12:07,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. +30: [2022-12-02 03:12:07,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt +30: [2022-12-02 03:12:07,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +60: [2022-12-02 03:12:07,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt. +60: [2022-12-02 03:12:07,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt +60: [2022-12-02 03:12:07,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +27: [2022-12-02 03:12:07,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. +44: [2022-12-02 03:12:07,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt. +27: [2022-12-02 03:12:07,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt +44: [2022-12-02 03:12:07,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt +27: [2022-12-02 03:12:07,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +44: [2022-12-02 03:12:07,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +21: [2022-12-02 03:12:07,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. +21: [2022-12-02 03:12:07,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt +21: [2022-12-02 03:12:07,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 2: [2022-12-02 03:12:07,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. + 2: [2022-12-02 03:12:07,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt + 2: [2022-12-02 03:12:07,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 3: [2022-12-02 03:12:07,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. + 3: [2022-12-02 03:12:07,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt + 3: [2022-12-02 03:12:07,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +22: [2022-12-02 03:12:07,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. +22: [2022-12-02 03:12:07,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt +22: [2022-12-02 03:12:07,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +31: [2022-12-02 03:12:07,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. +31: [2022-12-02 03:12:07,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt +31: [2022-12-02 03:12:07,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +11: [2022-12-02 03:12:07,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. +11: [2022-12-02 03:12:07,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt +11: [2022-12-02 03:12:07,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +12: [2022-12-02 03:12:07,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +12: [2022-12-02 03:12:07,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +12: [2022-12-02 03:12:07,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +31: [2022-12-02 03:12:07,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. +31: [2022-12-02 03:12:07,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt +31: [2022-12-02 03:12:07,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 9: [2022-12-02 03:12:07,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. + 9: [2022-12-02 03:12:07,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt + 9: [2022-12-02 03:12:07,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +46: [2022-12-02 03:12:07,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt. +46: [2022-12-02 03:12:07,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt +46: [2022-12-02 03:12:07,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +18: [2022-12-02 03:12:07,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. +18: [2022-12-02 03:12:07,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt +18: [2022-12-02 03:12:07,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 5: [2022-12-02 03:12:07,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. + 5: [2022-12-02 03:12:07,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt + 5: [2022-12-02 03:12:07,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +24: [2022-12-02 03:12:07,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. +24: [2022-12-02 03:12:07,249] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt + 1: [2022-12-02 03:12:07,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +24: [2022-12-02 03:12:07,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 1: [2022-12-02 03:12:07,249] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt + 1: [2022-12-02 03:12:07,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +29: [2022-12-02 03:12:07,253] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. +29: [2022-12-02 03:12:07,253] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt +29: [2022-12-02 03:12:07,253] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +18: [2022-12-02 03:12:07,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. +18: [2022-12-02 03:12:07,261] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt +18: [2022-12-02 03:12:07,261] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 5: [2022-12-02 03:12:07,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. + 5: [2022-12-02 03:12:07,265] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt + 5: [2022-12-02 03:12:07,265] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 7: [2022-12-02 03:12:07,265] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. + 7: [2022-12-02 03:12:07,265] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt + 7: [2022-12-02 03:12:07,265] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! +24: [2022-12-02 03:12:07,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. +24: [2022-12-02 03:12:07,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7beta/global_step5494/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt +24: [2022-12-02 03:12:07,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! + 0: successfully saved checkpoint at iteration 5494 to checkpoints_8b7beta +63: ------------------------------------------------------------------------------------------------------------ +63: test loss at the end of training for test data | lm loss value: 2.098141E+00 | lm loss PPL: 8.151003E+00 | +63: ------------------------------------------------------------------------------------------------------------ +END 2098877: Fri Dec 2 03:12:27 EET 2022 diff --git a/sbatch_8b7_beta.sh b/sbatch_8b7_beta.sh new file mode 100755 index 0000000000000000000000000000000000000000..3f861ab7797a45b85a6fc202c1b41d67b040ba9f --- /dev/null +++ b/sbatch_8b7_beta.sh @@ -0,0 +1,160 @@ +#!/bin/bash +#SBATCH --nodes=64 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p pilot +#SBATCH -t 24:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=8b7beta + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT +# Start from scratch +rm -rf "$CHECKPOINT_PATH" "$TENSORBOARD_PATH" + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" + +PP_SIZE=2 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_9293M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=1000 + +# Tokens: 11522010000 +# -> Samples: 5625981 +TRAIN_SAMPLES=5_625_981 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 56_260 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/tensorboard/events.out.tfevents.1669911940.nid007434.102652.0 b/tensorboard/events.out.tfevents.1669911940.nid007434.102652.0 new file mode 100644 index 0000000000000000000000000000000000000000..bde3b9f45c27ee2b9c7d6cb6cee7aa1570f5dcd1 --- /dev/null +++ b/tensorboard/events.out.tfevents.1669911940.nid007434.102652.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc347d289f988ce84da5fc4b4c048f533e2096eda49f9341dee84ceeb0546743 +size 9808394