Merge pull request #26 from LightricksResearch/cuda_optional
Files changed:
- .gitignore +4 -1
- inference.py +21 -7
- xora/pipelines/pipeline_xora_video.py +6 -1
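This PR makes CUDA optional: every hardcoded .cuda() / .to("cuda") call is gated behind torch.cuda.is_available(), so inference also runs on CPU-only machines.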
.gitignore
CHANGED

@@ -159,4 +159,7 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-.idea/
+.idea/
+
+# From inference.py
+video_output_*.mp4
inference.py
CHANGED

@@ -55,7 +55,9 @@ def load_vae(vae_dir):
     vae = CausalVideoAutoencoder.from_config(vae_config)
     vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
     vae.load_state_dict(vae_state_dict)
-    return vae.cuda().to(torch.bfloat16)
+    if torch.cuda.is_available():
+        vae = vae.cuda()
+    return vae.to(torch.bfloat16)
 
 
 def load_unet(unet_dir):

@@ -65,7 +67,9 @@ def load_unet(unet_dir):
     transformer = Transformer3DModel.from_config(transformer_config)
     unet_state_dict = safetensors.torch.load_file(unet_ckpt_path)
     transformer.load_state_dict(unet_state_dict, strict=True)
-    return transformer.cuda()
+    if torch.cuda.is_available():
+        transformer = transformer.cuda()
+    return transformer
 
 
 def load_scheduler(scheduler_dir):

@@ -254,7 +258,9 @@ def main():
     patchifier = SymmetricPatchifier(patch_size=1)
     text_encoder = T5EncoderModel.from_pretrained(
         "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder"
-    ).to("cuda")
+    )
+    if torch.cuda.is_available():
+        text_encoder = text_encoder.to("cuda")
     tokenizer = T5Tokenizer.from_pretrained(
         "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer"
     )

@@ -272,7 +278,9 @@ def main():
         "vae": vae,
     }
 
-    pipeline = XoraVideoPipeline(**submodel_dict).to("cuda")
+    pipeline = XoraVideoPipeline(**submodel_dict)
+    if torch.cuda.is_available():
+        pipeline = pipeline.to("cuda")
 
     # Prepare input for the pipeline
     sample = {

@@ -286,8 +294,12 @@ def main():
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
-    torch.cuda.manual_seed(args.seed)
-    generator = torch.Generator(device="cuda").manual_seed(args.seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(args.seed)
+
+    generator = torch.Generator(
+        device="cuda" if torch.cuda.is_available() else "cpu"
+    ).manual_seed(args.seed)
 
     images = pipeline(
         num_inference_steps=args.num_inference_steps,

@@ -322,7 +334,9 @@ def main():
     )
 
     for i in range(images.shape[0]):
-        video_np = images[i].permute(1, 2, 3, 0).cpu().float().numpy()
+        # Gathering from B, C, F, H, W to C, F, H, W and then permuting to F, H, W, C
+        video_np = images[i].permute(1, 2, 3, 0).cpu().float().numpy()
+        # Unnormalizing images to [0, 255] range
        video_np = (video_np * 255).astype(np.uint8)
         fps = args.frame_rate
         height, width = video_np.shape[1:3]
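Every inference.py hunk applies the same pattern: move a module to the GPU only when one is present, and fall back to the CPU otherwise. A minimal sketch of that pattern follows; the helper name to_best_device and the nn.Linear example are illustrative, not part of the PR.

import torch


def to_best_device(module: torch.nn.Module) -> torch.nn.Module:
    # Use CUDA only when a GPU is actually available; otherwise stay on CPU.
    if torch.cuda.is_available():
        return module.cuda()
    return module


# Usage: the same call works on CUDA and on CPU-only machines.
model = to_best_device(torch.nn.Linear(4, 4))
print(next(model.parameters()).device)  # cuda:0 with a GPU, cpu without

On a CUDA machine this prints cuda:0; on a CPU-only machine the same script prints cpu, which is the behavior the PR adds to load_vae, load_unet, the text encoder, and the pipeline.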
xora/pipelines/pipeline_xora_video.py
CHANGED

@@ -1010,7 +1010,12 @@ class XoraVideoPipeline(DiffusionPipeline):
             current_timestep = current_timestep * (1 - conditioning_mask)
             # Choose the appropriate context manager based on `mixed_precision`
             if mixed_precision:
-                context_manager = torch.autocast("cuda", dtype=torch.bfloat16)
+                if "xla" in device.type:
+                    raise NotImplementedError(
+                        "Mixed precision is not supported yet on XLA devices."
+                    )
+
+                context_manager = torch.autocast(device, dtype=torch.bfloat16)
             else:
                 context_manager = nullcontext()  # Dummy context manager
 
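The pipeline change keeps bfloat16 autocast for mixed precision but now derives the target from the runtime device and rejects XLA explicitly, where autocast is not wired up here. A standalone sketch of the same selection logic; the function name precision_context is illustrative, not from the PR.

from contextlib import nullcontext

import torch


def precision_context(device: torch.device, mixed_precision: bool):
    # bfloat16 autocast when mixed precision is requested, a no-op context
    # otherwise, and an explicit failure on XLA devices.
    if not mixed_precision:
        return nullcontext()
    if "xla" in device.type:
        raise NotImplementedError("Mixed precision is not supported yet on XLA devices.")
    return torch.autocast(device.type, dtype=torch.bfloat16)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
with precision_context(device, mixed_precision=True):
    out = torch.randn(8, 8, device=device) @ torch.randn(8, 8, device=device)

Returning the context manager instead of entering it directly lets the caller use a single with-statement for both the mixed- and full-precision paths, which is the design the pipeline code follows.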