tsqn committed on
Commit 96cc85c · verified · 1 Parent(s): bc1d5c6

Update app.py

Files changed (1)
  1. app.py +23 -23
app.py CHANGED
@@ -1,5 +1,5 @@
 """
-THis is the main file for the gradio web demo. It uses the CogVideoX1.5-5B model to generate videos gradio web demo.
+This is the main file for the Gradio web demo. It uses the CogVideoX-5B model to generate videos.
 set environment variable OPENAI_API_KEY to use the OpenAI API to enhance the prompt.
 
 Usage:
@@ -45,31 +45,31 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 #snapshot_download(repo_id="AlexWortega/RIFE", local_dir="model_rife")
 quantization = int8_weight_only
 
-transformer = CogVideoXTransformer3DModel.from_pretrained("THUDM/CogVideoX1.5-5B", subfolder="transformer", torch_dtype=torch.bfloat16)
-text_encoder = T5EncoderModel.from_pretrained("THUDM/CogVideoX1.5-5B", subfolder="text_encoder", torch_dtype=torch.bfloat16)
-vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX1.5-5B", subfolder="vae", torch_dtype=torch.bfloat16)
+transformer = CogVideoXTransformer3DModel.from_pretrained("THUDM/CogVideoX-5B", subfolder="transformer", torch_dtype=torch.bfloat16)
+text_encoder = T5EncoderModel.from_pretrained("THUDM/CogVideoX-5B", subfolder="text_encoder", torch_dtype=torch.bfloat16)
+vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-5B", subfolder="vae", torch_dtype=torch.bfloat16)
 quantize_(transformer, quantization())
 quantize_(text_encoder, quantization())
 # quantize_(vae, quantization())
 
 pipe = CogVideoXPipeline.from_pretrained(
-    "THUDM/CogVideoX1.5-5B",
+    "THUDM/CogVideoX-5B",
     text_encoder=text_encoder,
     transformer=transformer,
     vae=vae,
     torch_dtype=torch.bfloat16
-).to("cpu")
+).to(device)
 pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
 
-pipe.enable_model_cpu_offload()
+# pipe.enable_model_cpu_offload()
 pipe.vae.enable_tiling()
 pipe.vae.enable_slicing()
 
 i2v_transformer = CogVideoXTransformer3DModel.from_pretrained(
-    "THUDM/CogVideoX1.5-5B-I2V", subfolder="transformer", torch_dtype=torch.bfloat16
+    "THUDM/CogVideoX-5B-I2V", subfolder="transformer", torch_dtype=torch.bfloat16
 )
-i2v_text_encoder = T5EncoderModel.from_pretrained("THUDM/CogVideoX1.5-5B-I2V", subfolder="text_encoder", torch_dtype=torch.bfloat16)
-i2v_vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX1.5-5B-I2V", subfolder="vae", torch_dtype=torch.bfloat16)
+i2v_text_encoder = T5EncoderModel.from_pretrained("THUDM/CogVideoX-5B-I2V", subfolder="text_encoder", torch_dtype=torch.bfloat16)
+i2v_vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-5B-I2V", subfolder="vae", torch_dtype=torch.bfloat16)
 
 quantize_(i2v_transformer, quantization())
 quantize_(i2v_text_encoder, quantization())
@@ -240,7 +240,7 @@ def infer(
     if video_input is not None:
         video = load_video(video_input)[:49]  # Limit to 49 frames
         pipe_video = CogVideoXVideoToVideoPipeline.from_pretrained(
-            "THUDM/CogVideoX1.5-5B-",
+            "THUDM/CogVideoX-5B",
             transformer=transformer,
             vae=vae,
             scheduler=pipe.scheduler,
@@ -249,7 +249,7 @@ def infer(
             torch_dtype=torch.bfloat16,
         ).to(device)
 
-        pipe_video.enable_model_cpu_offload()
+        # pipe_video.enable_model_cpu_offload()
         pipe_video.vae.enable_tiling()
         pipe_video.vae.enable_slicing()
         video_pt = pipe_video(
@@ -261,15 +261,15 @@ def infer(
             use_dynamic_cfg=True,
             output_type="pt",
             guidance_scale=guidance_scale,
-            generator=torch.Generator(device="cpu").manual_seed(seed),
+            generator=torch.Generator(device=device).manual_seed(seed),
         ).frames
-        pipe_video.to("cpu")
+        pipe_video.to(device)
         del pipe_video
         gc.collect()
         torch.cuda.empty_cache()
     elif image_input is not None:
         pipe_image = CogVideoXImageToVideoPipeline.from_pretrained(
-            "THUDM/CogVideoX1.5-5B-I2V",
+            "THUDM/CogVideoX-5B-I2V",
             transformer=i2v_transformer,
             vae=i2v_vae,
             scheduler=pipe.scheduler,
@@ -287,9 +287,9 @@ def infer(
             use_dynamic_cfg=True,
             output_type="pt",
             guidance_scale=guidance_scale,
-            generator=torch.Generator(device="cpu").manual_seed(seed),
+            generator=torch.Generator(device=device).manual_seed(seed),
         ).frames
-        pipe_image.to("cpu")
+        pipe_image.to(device)
         del pipe_image
         gc.collect()
         torch.cuda.empty_cache()
@@ -303,9 +303,9 @@ def infer(
             use_dynamic_cfg=True,
             output_type="pt",
             guidance_scale=guidance_scale,
-            generator=torch.Generator(device="cpu").manual_seed(seed),
+            generator=torch.Generator(device=device).manual_seed(seed),
         ).frames
-        pipe.to("cpu")
+        pipe.to(device)
         gc.collect()
     return (video_pt, seed)
 
@@ -342,17 +342,17 @@ examples_images = [["example_images/beach.png"], ["example_images/street.png"],
 with gr.Blocks() as demo:
     gr.Markdown("""
     <div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 20px;">
-        CogVideoX1.5-5B Huggingface Space🤗
+        CogVideoX-5B Huggingface Space🤗
     </div>
     <div style="text-align: center;">
-        <a href="https://huggingface.co/THUDM/CogVideoX1.5-5B">🤗 5B(T2V) Model Hub</a> |
-        <a href="https://huggingface.co/THUDM/CogVideoX1.5-5B-I2V">🤗 5B(I2V) Model Hub</a> |
+        <a href="https://huggingface.co/THUDM/CogVideoX-5B">🤗 5B(T2V) Model Hub</a> |
+        <a href="https://huggingface.co/THUDM/CogVideoX-5B-I2V">🤗 5B(I2V) Model Hub</a> |
         <a href="https://github.com/THUDM/CogVideo">🌐 Github</a> |
         <a href="https://arxiv.org/pdf/2408.06072">📜 arxiv </a>
     </div>
     <div style="text-align: center;display: flex;justify-content: center;align-items: center;margin-top: 1em;margin-bottom: .5em;">
         <span>If the Space is too busy, duplicate it to use privately</span>
-        <a href="https://huggingface.co/spaces/tsqn/CogVideoX1.5-5B-Space?duplicate=true"><img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-lg.svg" width="160" style="
+        <a href="https://huggingface.co/spaces/tsqn/CogVideoX-5B-Space?duplicate=true"><img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-lg.svg" width="160" style="
             margin-left: .75em;
         "></a>
     </div>
 
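One reproducibility caveat that follows from the generator change, noted here as a general PyTorch property rather than anything this commit documents: a torch.Generator seeded on the CPU and one seeded on CUDA use different RNG algorithms (Mersenne Twister vs. Philox), so the same seed yields different noise, and seeds saved from the pre-commit version of the Space will not reproduce the same videos. A small check, assuming a CUDA device is available:

import torch

seed = 42  # any fixed seed; 42 is only for illustration

# Same seed, different devices: the sampled noise (and therefore the
# generated video) differs between CPU- and CUDA-seeded generators.
g_cpu = torch.Generator(device="cpu").manual_seed(seed)
g_cuda = torch.Generator(device="cuda").manual_seed(seed)

noise_cpu = torch.randn(4, generator=g_cpu)
noise_cuda = torch.randn(4, generator=g_cuda, device="cuda")
print(torch.allclose(noise_cpu, noise_cuda.cpu()))  # expected: False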