convert to CPU
Files changed:
- annotator/midas/__init__.py +2 -2
- app.py +109 -110
- ckpt/cldm_v15.yaml +2 -0
- requirements.txt +1 -0
- stablevideo/atlas_data.py +1 -1
- stablevideo/atlas_utils.py +1 -1
annotator/midas/__init__.py
CHANGED
@@ -8,13 +8,13 @@ from .api import MiDaSInference
 
 class MidasDetector:
     def __init__(self):
-        self.model = MiDaSInference(model_type="dpt_hybrid")
+        self.model = MiDaSInference(model_type="dpt_hybrid")
 
     def __call__(self, input_image, a=np.pi * 2.0, bg_th=0.1):
         assert input_image.ndim == 3
         image_depth = input_image
         with torch.no_grad():
-            image_depth = torch.from_numpy(image_depth).float()
+            image_depth = torch.from_numpy(image_depth).float()
             image_depth = image_depth / 127.5 - 1.0
             image_depth = rearrange(image_depth, 'h w c -> 1 c h w')
             depth = self.model(image_depth)[0]
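For reference, the detector shown above expects an H x W x C uint8 image, scales it from [0, 255] to [-1, 1], and reshapes it to a 1 x C x H x W batch before the MiDaS forward pass. A minimal standalone sketch of that preprocessing on a CPU-only machine (the dummy frame is hypothetical; no model is loaded):

import numpy as np
import torch
from einops import rearrange

# Hypothetical uint8 RGB frame, shape (H, W, C), values in [0, 255].
frame = np.random.randint(0, 256, size=(384, 384, 3), dtype=np.uint8)

with torch.no_grad():
    x = torch.from_numpy(frame).float()      # stays on the CPU
    x = x / 127.5 - 1.0                      # [0, 255] -> [-1, 1]
    x = rearrange(x, 'h w c -> 1 c h w')     # batch of one, channels first
# x now has the 1 x 3 x H x W layout that MidasDetector.__call__ feeds to MiDaSInference.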
app.py
CHANGED
@@ -48,7 +48,7 @@ class StableVideo:
     ):
         self.apply_canny = CannyDetector()
         canny_model = create_model(base_cfg).cpu()
-        canny_model.load_state_dict(load_state_dict(canny_model_cfg, location='
+        canny_model.load_state_dict(load_state_dict(canny_model_cfg, location='cpu'), strict=False)
         self.canny_ddim_sampler = DDIMSampler(canny_model)
         self.canny_model = canny_model
 
@@ -59,7 +59,7 @@ class StableVideo:
     ):
         self.apply_midas = MidasDetector()
         depth_model = create_model(base_cfg).cpu()
-        depth_model.load_state_dict(load_state_dict(depth_model_cfg, location='
+        depth_model.load_state_dict(load_state_dict(depth_model_cfg, location='cpu'), strict=False)
         self.depth_ddim_sampler = DDIMSampler(depth_model)
         self.depth_model = depth_model
 
@@ -101,7 +101,7 @@ class StableVideo:
 
         detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
 
-        control = torch.from_numpy(detected_map.copy()).float()
+        control = torch.from_numpy(detected_map.copy()).float() / 255.0
         control = torch.stack([control for _ in range(1)], dim=0)
         control = einops.rearrange(control, 'b h w c -> b c h w').clone()
 
@@ -128,7 +128,7 @@ class StableVideo:
 
     @torch.no_grad()
     def edit_background(self, *args, **kwargs):
-        self.depth_model = self.depth_model
+        self.depth_model = self.depth_model
 
         input_image = self.b_atlas_origin
         self.depth_edit(input_image, *args, **kwargs)
@@ -155,7 +155,7 @@ class StableVideo:
                 if_net=False,
                 num_samples=1):
 
-        self.canny_model = self.canny_model
+        self.canny_model = self.canny_model
 
        keyframes = [int(x) for x in keyframes.split(",")]
        if self.data is None:
@@ -186,7 +186,7 @@ class StableVideo:
            # get canny control
            detected_map = self.apply_canny(img, low_threshold, high_threshold)
            detected_map = HWC3(detected_map)
-            control = torch.from_numpy(detected_map.copy()).float()
+            control = torch.from_numpy(detected_map.copy()).float() / 255.0
            control = einops.rearrange(control.unsqueeze(0), 'b h w c -> b c h w').clone()
 
            cond = {"c_concat": [control], "c_crossattn": c_crossattn}
@@ -195,7 +195,7 @@ class StableVideo:
 
            # if not the key frame, calculate the mapping from last atlas
            if i == 0:
-                latent = torch.randn((1, 4, H // 8, W // 8))
+                latent = torch.randn((1, 4, H // 8, W // 8))
                samples, _ = self.canny_ddim_sampler.sample(ddim_steps, num_samples,
                                                            shape, cond, verbose=False, eta=eta,
                                                            unconditional_guidance_scale=scale,
@@ -209,7 +209,7 @@ class StableVideo:
                mapped_img = mapped_img.resize((W, H))
                mapped_img = np.array(mapped_img).astype(np.float32) / 255.0
                mapped_img = mapped_img[None].transpose(0, 3, 1, 2)
-                mapped_img = torch.from_numpy(mapped_img)
+                mapped_img = torch.from_numpy(mapped_img)
                mapped_img = 2. * mapped_img - 1.
                latent = self.canny_model.get_first_stage_encoding(self.canny_model.encode_first_stage(mapped_img))
 
@@ -232,7 +232,7 @@ class StableVideo:
            result = alpha * result
 
            # buffer for training
-            result_copy = result.clone()
+            result_copy = result.clone()
            result_copy.requires_grad = True
            result_list.append(result_copy)
 
@@ -259,7 +259,7 @@ class StableVideo:
        # aggregate net #
        #####################################
        lr, n_epoch = 1e-3, 500
-        agg_net = AGGNet()
+        agg_net = AGGNet()
        loss_fn = nn.L1Loss()
        optimizer = optim.SGD(agg_net.parameters(), lr=lr, momentum=0.9)
        for _ in range(n_epoch):
@@ -291,12 +291,12 @@ class StableVideo:
    def render(self, f_atlas, b_atlas):
        # foreground
        if f_atlas == None:
-            f_atlas = transforms.ToTensor()(self.f_atlas_origin).unsqueeze(0)
+            f_atlas = transforms.ToTensor()(self.f_atlas_origin).unsqueeze(0)
        else:
            f_atlas, mask = f_atlas["image"], f_atlas["mask"]
-            f_atlas_origin = transforms.ToTensor()(self.f_atlas_origin).unsqueeze(0)
-            f_atlas = transforms.ToTensor()(f_atlas).unsqueeze(0)
-            mask = transforms.ToTensor()(mask).unsqueeze(0)
+            f_atlas_origin = transforms.ToTensor()(self.f_atlas_origin).unsqueeze(0)
+            f_atlas = transforms.ToTensor()(f_atlas).unsqueeze(0)
+            mask = transforms.ToTensor()(mask).unsqueeze(0)
            if f_atlas.shape != mask.shape:
                print("Warning: truncating mask to atlas shape {}".format(f_atlas.shape))
                mask = mask[:f_atlas.shape[0], :f_atlas.shape[1], :f_atlas.shape[2], :f_atlas.shape[3]]
@@ -326,7 +326,7 @@ class StableVideo:
        if b_atlas == None:
            b_atlas = self.b_atlas_origin
 
-        b_atlas = transforms.ToTensor()(b_atlas).unsqueeze(0)
+        b_atlas = transforms.ToTensor()(b_atlas).unsqueeze(0)
        background_edit = F.grid_sample(
            b_atlas, self.data.scaled_background_uvs, mode="bilinear", align_corners=self.data.config["align_corners"]
        ).clamp(min=0.0, max=1.0)
@@ -349,99 +349,98 @@ class StableVideo:
        return save_name
 
 if __name__ == '__main__':
+    stablevideo = StableVideo(base_cfg="ckpt/cldm_v15.yaml",
+                              canny_model_cfg="ckpt/control_sd15_canny.pth",
+                              depth_model_cfg="ckpt/control_sd15_depth.pth",
+                              save_memory=True)
+    stablevideo.load_canny_model()
+    stablevideo.load_depth_model()
+
+    block = gr.Blocks().queue()
+    with block:
+        with gr.Row():
+            gr.Markdown("## StableVideo")
+        with gr.Row():
+            with gr.Column():
+                original_video = gr.Video(label="Original Video", interactive=False)
+                with gr.Row():
+                    foreground_atlas = gr.Image(label="Foreground Atlas", type="pil")
+                    background_atlas = gr.Image(label="Background Atlas", type="pil")
+                gr.Markdown("### Step 1. select one example video and click **Load Video** buttom and wait for 10 sec.")
+                avail_video = [f.name for f in os.scandir("data") if f.is_dir()]
+                video_name = gr.Radio(choices=avail_video,
+                                      label="Select Example Videos",
+                                      value="car-turn")
+                load_video_button = gr.Button("Load Video")
+                gr.Markdown("### Step 2. write text prompt and advanced options for background and foreground.")
+                with gr.Row():
+                    f_prompt = gr.Textbox(label="Foreground Prompt", value="a picture of an orange suv")
+                    b_prompt = gr.Textbox(label="Background Prompt", value="winter scene, snowy scene, beautiful snow")
+                with gr.Row():
+                    with gr.Accordion("Advanced Foreground Options", open=False):
+                        adv_keyframes = gr.Textbox(label="keyframe", value="20, 40, 60")
+                        adv_atlas_resolution = gr.Slider(label="Atlas Resolution", minimum=1000, maximum=3000, value=2000, step=100)
+                        adv_image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=256)
+                        adv_low_threshold = gr.Slider(label="Canny low threshold", minimum=1, maximum=255, value=100, step=1)
+                        adv_high_threshold = gr.Slider(label="Canny high threshold", minimum=1, maximum=255, value=200, step=1)
+                        adv_ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
+                        adv_s = gr.Slider(label="Noise Scale", minimum=0.0, maximum=1.0, value=0.8, step=0.01)
+                        adv_scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=15.0, value=9.0, step=0.1)
+                        adv_seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
+                        adv_eta = gr.Number(label="eta (DDIM)", value=0.0)
+                        adv_a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed, no background')
+                        adv_n_prompt = gr.Textbox(label="Negative Prompt", value='lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality')
+                        adv_if_net = gr.gradio.Checkbox(label="if use agg net", value=False)
+
+                    with gr.Accordion("Background Options", open=False):
+                        b_image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=256)
+                        b_detect_resolution = gr.Slider(label="Depth Resolution", minimum=128, maximum=1024, value=512, step=1)
+                        b_ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
+                        b_scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1)
+                        b_seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
+                        b_eta = gr.Number(label="eta (DDIM)", value=0.0)
+                        b_a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed')
+                        b_n_prompt = gr.Textbox(label="Negative Prompt",
+                                                value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality')
+                gr.Markdown("### Step 3. edit each one and render.")
+                with gr.Row():
+                    f_advance_run_button = gr.Button("Advanced Edit Foreground (slower, better)")
+                    b_run_button = gr.Button("Edit Background")
+                    run_button = gr.Button("Render")
+            with gr.Column():
+                output_video = gr.Video(label="Output Video", interactive=False)
+                # output_foreground_atlas = gr.Image(label="Output Foreground Atlas", type="pil", interactive=False)
+                output_foreground_atlas = gr.ImageMask(label="Editable Output Foreground Atlas", type="pil", tool="sketch", interactive=True)
+                output_background_atlas = gr.Image(label="Output Background Atlas", type="pil", interactive=False)
+
+        # edit param
+        f_adv_edit_param = [adv_keyframes,
+                            adv_atlas_resolution,
+                            f_prompt,
+                            adv_a_prompt,
+                            adv_n_prompt,
+                            adv_image_resolution,
+                            adv_low_threshold,
+                            adv_high_threshold,
+                            adv_ddim_steps,
+                            adv_s,
+                            adv_scale,
+                            adv_seed,
+                            adv_eta,
+                            adv_if_net]
+        b_edit_param = [b_prompt,
+                        b_a_prompt,
+                        b_n_prompt,
+                        b_image_resolution,
+                        b_detect_resolution,
+                        b_ddim_steps,
+                        b_scale,
+                        b_seed,
+                        b_eta]
+        # action
+        load_video_button.click(fn=stablevideo.load_video, inputs=video_name, outputs=[original_video, foreground_atlas, background_atlas])
+        f_advance_run_button.click(fn=stablevideo.advanced_edit_foreground, inputs=f_adv_edit_param, outputs=[output_foreground_atlas])
+        b_run_button.click(fn=stablevideo.edit_background, inputs=b_edit_param, outputs=[output_background_atlas])
+        run_button.click(fn=stablevideo.render, inputs=[output_foreground_atlas, output_background_atlas], outputs=[output_video])
+
+    block.launch()
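The rewritten __main__ block is standard Gradio 3.x wiring: declare components inside gr.Blocks(), connect buttons to the StableVideo methods with .click(), and serialize long-running jobs through .queue(). A stripped-down sketch of the same pattern, using a placeholder callback instead of the real StableVideo methods (all names here are illustrative, not taken from app.py):

import gradio as gr

def placeholder_render(prompt):
    # Stand-in for stablevideo.render / edit_* so the wiring can be tested by itself.
    return f"would render with prompt: {prompt}"

block = gr.Blocks().queue()   # queue() serializes requests, which matters for slow CPU-only inference
with block:
    with gr.Row():
        prompt = gr.Textbox(label="Prompt")
        run_button = gr.Button("Render")
    output = gr.Textbox(label="Output", interactive=False)
    # Event wiring: fn receives the input component values and fills the output component.
    run_button.click(fn=placeholder_render, inputs=prompt, outputs=output)

block.launch()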
ckpt/cldm_v15.yaml
CHANGED
@@ -77,3 +77,5 @@ model:
 
     cond_stage_config:
       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+      params:
+        device: "cpu"
requirements.txt
CHANGED
@@ -120,3 +120,4 @@ wcwidth==0.2.6
 websockets==11.0.3
 Werkzeug==2.3.7
 yarl==1.9.2
+xformers
stablevideo/atlas_data.py
CHANGED
@@ -30,7 +30,7 @@ class AtlasData():
         maximum_number_of_frames = json_dict["maximum_number_of_frames"]
 
         config = {
-            "device": "
+            "device": "cpu",
             "checkpoint_path": f"data/{video_name}/checkpoint.ckpt",
             "resx": json_dict["resx"],
             "resy": json_dict["resy"],
stablevideo/atlas_utils.py
CHANGED
@@ -72,7 +72,7 @@ def load_neural_atlases_models(config):
         skip_layers=[],
     ).to(config["device"])
 
-    checkpoint = torch.load(config["checkpoint_path"])
+    checkpoint = torch.load(config["checkpoint_path"], map_location=torch.device('cpu'))
     foreground_mapping.load_state_dict(checkpoint["model_F_mapping1_state_dict"])
     background_mapping.load_state_dict(checkpoint["model_F_mapping2_state_dict"])
     foreground_atlas_model.load_state_dict(checkpoint["F_atlas_state_dict"])
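Taken together with the app.py and atlas_data.py changes, this commit follows one pattern: keep every module on the CPU and make every checkpoint load device-agnostic, since torch.load would otherwise try to restore CUDA tensors onto a GPU the Space does not have. A minimal sketch of that loading pattern (the helper name and the commented usage are illustrative, not code from this repository):

import torch

def load_checkpoint_cpu(path: str) -> dict:
    # map_location remaps tensors that were saved on a CUDA device back onto the CPU,
    # so the checkpoint can be opened on a machine without a GPU.
    return torch.load(path, map_location=torch.device("cpu"))

# Hypothetical usage, mirroring load_neural_atlases_models above:
# checkpoint = load_checkpoint_cpu("data/car-turn/checkpoint.ckpt")
# foreground_mapping.load_state_dict(checkpoint["model_F_mapping1_state_dict"])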