CogVideoX-5B-24frames_20steps-low_vram

Build error

App Files Files Community

tsqn committed on Nov 20, 2024

Commit

c628a1c

verified ·

1 Parent(s): 3667bd9

Update app.py

Browse files

Files changed (1) hide show

app.py +148 -147

app.py CHANGED Viewed

@@ -39,7 +39,7 @@ import utils
 #from huggingface_hub import hf_hub_download, snapshot_download
 import gc
-device = "cuda" if torch.cuda.is_available() else "cpu"
 #hf_hub_download(repo_id="ai-forever/Real-ESRGAN", filename="RealESRGAN_x4.pth", local_dir="model_real_esran")
 #snapshot_download(repo_id="AlexWortega/RIFE", local_dir="model_rife")
@@ -65,14 +65,14 @@ pipe.enable_model_cpu_offload()
 pipe.vae.enable_tiling()
 pipe.vae.enable_slicing()
-i2v_transformer = CogVideoXTransformer3DModel.from_pretrained(
-    "THUDM/CogVideoX-5B-I2V", subfolder="transformer", torch_dtype=torch.bfloat16
-)
-i2v_text_encoder = T5EncoderModel.from_pretrained("THUDM/CogVideoX-5B-I2V", subfolder="text_encoder", torch_dtype=torch.bfloat16)
-i2v_vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-5B-I2V", subfolder="vae", torch_dtype=torch.bfloat16)
-quantize_(i2v_transformer, quantization())
-quantize_(i2v_text_encoder, quantization())
 # quantize_(i2v_vae, quantization())
 # pipe.transformer.to(memory_format=torch.channels_last)
@@ -100,78 +100,78 @@ Video descriptions must have the same num of words as examples below. Extra word
 """
-def resize_if_unfit(input_video, progress=gr.Progress(track_tqdm=True)):
-    width, height = get_video_dimensions(input_video)
-    if width == 720 and height == 480:
-        processed_video = input_video
-    else:
-        processed_video = center_crop_resize(input_video)
-    return processed_video
-def get_video_dimensions(input_video_path):
-    reader = imageio_ffmpeg.read_frames(input_video_path)
-    metadata = next(reader)
-    return metadata["size"]
-def center_crop_resize(input_video_path, target_width=720, target_height=480):
-    cap = cv2.VideoCapture(input_video_path)
-    orig_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    orig_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    orig_fps = cap.get(cv2.CAP_PROP_FPS)
-    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-    width_factor = target_width / orig_width
-    height_factor = target_height / orig_height
-    resize_factor = max(width_factor, height_factor)
-    inter_width = int(orig_width * resize_factor)
-    inter_height = int(orig_height * resize_factor)
-    target_fps = 8
-    ideal_skip = max(0, math.ceil(orig_fps / target_fps) - 1)
-    skip = min(5, ideal_skip)  # Cap at 5
-    while (total_frames / (skip + 1)) < 49 and skip > 0:
-        skip -= 1
-    processed_frames = []
-    frame_count = 0
-    total_read = 0
-    while frame_count < 49 and total_read < total_frames:
-        ret, frame = cap.read()
-        if not ret:
-            break
-        if total_read % (skip + 1) == 0:
-            resized = cv2.resize(frame, (inter_width, inter_height), interpolation=cv2.INTER_AREA)
-            start_x = (inter_width - target_width) // 2
-            start_y = (inter_height - target_height) // 2
-            cropped = resized[start_y : start_y + target_height, start_x : start_x + target_width]
-            processed_frames.append(cropped)
-            frame_count += 1
-        total_read += 1
-    cap.release()
-    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
-        temp_video_path = temp_file.name
-        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
-        out = cv2.VideoWriter(temp_video_path, fourcc, target_fps, (target_width, target_height))
-        for frame in processed_frames:
-            out.write(frame)
-        out.release()
-    return temp_video_path
 # def convert_prompt(prompt: str, retry_times: int = 3) -> str:
@@ -226,9 +226,9 @@ def center_crop_resize(input_video_path, target_width=720, target_height=480):
 def infer(
     prompt: str,
-    image_input: str,
-    video_input: str,
-    video_strenght: float,
     num_inference_steps: int,
     guidance_scale: float,
     seed: int = -1,
@@ -237,76 +237,76 @@ def infer(
     if seed == -1:
         seed = random.randint(0, 2**8 - 1)
-    if video_input is not None:
-        video = load_video(video_input)[:49]  # Limit to 49 frames
-        pipe_video = CogVideoXVideoToVideoPipeline.from_pretrained(
-            "THUDM/CogVideoX-5B",
-            transformer=transformer,
-            vae=vae,
-            scheduler=pipe.scheduler,
-            tokenizer=pipe.tokenizer,
-            text_encoder=text_encoder,
-            torch_dtype=torch.bfloat16,
-        ).to(device)
-        # pipe_video.enable_model_cpu_offload()
-        pipe_video.vae.enable_tiling()
-        pipe_video.vae.enable_slicing()
-        video_pt = pipe_video(
-            video=video,
-            prompt=prompt,
-            num_inference_steps=num_inference_steps,
-            num_videos_per_prompt=1,
-            strength=video_strenght,
-            use_dynamic_cfg=True,
-            output_type="pt",
-            guidance_scale=guidance_scale,
-            generator=torch.Generator(device="cpu").manual_seed(seed),
-        ).frames
-        pipe_video.to("cpu")
-        del pipe_video
-        gc.collect()
-        torch.cuda.empty_cache()
-    elif image_input is not None:
-        pipe_image = CogVideoXImageToVideoPipeline.from_pretrained(
-            "THUDM/CogVideoX-5B-I2V",
-            transformer=i2v_transformer,
-            vae=i2v_vae,
-            scheduler=pipe.scheduler,
-            tokenizer=pipe.tokenizer,
-            text_encoder=i2v_text_encoder,
-            torch_dtype=torch.bfloat16,
-        ).to(device)
-        image_input = Image.fromarray(image_input).resize(size=(720, 480))  # Convert to PIL
-        image = load_image(image_input)
-        video_pt = pipe_image(
-            image=image,
-            prompt=prompt,
-            num_inference_steps=num_inference_steps,
-            num_videos_per_prompt=1,
-            use_dynamic_cfg=True,
-            output_type="pt",
-            guidance_scale=guidance_scale,
-            generator=torch.Generator(device="cpu").manual_seed(seed),
-        ).frames
-        pipe_image.to("cpu")
-        del pipe_image
-        gc.collect()
-        torch.cuda.empty_cache()
-    else:
-        pipe.to(device)
-        video_pt = pipe(
-            prompt=prompt,
-            num_videos_per_prompt=1,
-            num_inference_steps=num_inference_steps,
-            num_frames=24,
-            use_dynamic_cfg=True,
-            output_type="pt",
-            guidance_scale=guidance_scale,
-            generator=torch.Generator(device="cpu").manual_seed(seed),
-        ).frames
-        pipe.to("cpu")
-        gc.collect()
     return (video_pt, seed)
@@ -362,13 +362,13 @@ with gr.Blocks() as demo:
            """)
     with gr.Row():
         with gr.Column():
-            with gr.Accordion("I2V: Image Input (cannot be used simultaneously with video input)", open=False):
-                image_input = gr.Image(label="Input Image (will be cropped to 720 * 480)")
-                examples_component_images = gr.Examples(examples_images, inputs=[image_input], cache_examples=False)
-            with gr.Accordion("V2V: Video Input (cannot be used simultaneously with image input)", open=False):
-                video_input = gr.Video(label="Input Video (will be cropped to 49 frames, 6 seconds at 8fps)")
-                strength = gr.Slider(0.1, 1.0, value=0.8, step=0.01, label="Strength")
-                examples_component_videos = gr.Examples(examples_videos, inputs=[video_input], cache_examples=False)
             prompt = gr.Textbox(label="Prompt (Less than 200 Words)", placeholder="Enter your prompt here", lines=5)
             # with gr.Row():
@@ -465,9 +465,9 @@ with gr.Blocks() as demo:
     @spaces.GPU(duration=120)
     def generate(
         prompt,
-        image_input,
-        video_input,
-        video_strength,
         seed_value,
         # scale_status,
         # rife_status,
@@ -475,10 +475,10 @@ with gr.Blocks() as demo:
     ):
         latents, seed = infer(
             prompt,
-            image_input,
-            video_input,
-            video_strength,
-            num_inference_steps=20,  # Changed from 50
             guidance_scale=7.0,  # NOT Changed
             seed=seed_value,
             progress=progress,
@@ -511,13 +511,14 @@ with gr.Blocks() as demo:
     generate_button.click(
         generate,
-        inputs=[prompt, image_input, video_input, strength, seed_param],
         # inputs=[prompt, image_input, video_input, strength, seed_param, enable_scale, enable_rife],
         outputs=[video_output, download_video_button, download_gif_button, seed_text],
     )
     # enhance_button.click(enhance_prompt_func, inputs=[prompt], outputs=[prompt])
-    video_input.upload(resize_if_unfit, inputs=[video_input], outputs=[video_input])
 if __name__ == "__main__":
     utils.install_packages()

 #from huggingface_hub import hf_hub_download, snapshot_download
 import gc
+#device = "cuda" if torch.cuda.is_available() else "cpu"
 #hf_hub_download(repo_id="ai-forever/Real-ESRGAN", filename="RealESRGAN_x4.pth", local_dir="model_real_esran")
 #snapshot_download(repo_id="AlexWortega/RIFE", local_dir="model_rife")
 pipe.vae.enable_tiling()
 pipe.vae.enable_slicing()
+# i2v_transformer = CogVideoXTransformer3DModel.from_pretrained(
+#     "THUDM/CogVideoX-5B-I2V", subfolder="transformer", torch_dtype=torch.bfloat16
+# )
+# i2v_text_encoder = T5EncoderModel.from_pretrained("THUDM/CogVideoX-5B-I2V", subfolder="text_encoder", torch_dtype=torch.bfloat16)
+# i2v_vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-5B-I2V", subfolder="vae", torch_dtype=torch.bfloat16)
+# quantize_(i2v_transformer, quantization())
+# quantize_(i2v_text_encoder, quantization())
 # quantize_(i2v_vae, quantization())
 # pipe.transformer.to(memory_format=torch.channels_last)
 """
+# def resize_if_unfit(input_video, progress=gr.Progress(track_tqdm=True)):
+#     width, height = get_video_dimensions(input_video)
+#     if width == 720 and height == 480:
+#         processed_video = input_video
+#     else:
+#         processed_video = center_crop_resize(input_video)
+#     return processed_video
+# def get_video_dimensions(input_video_path):
+#     reader = imageio_ffmpeg.read_frames(input_video_path)
+#     metadata = next(reader)
+#     return metadata["size"]
+# def center_crop_resize(input_video_path, target_width=720, target_height=480):
+#     cap = cv2.VideoCapture(input_video_path)
+#     orig_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+#     orig_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+#     orig_fps = cap.get(cv2.CAP_PROP_FPS)
+#     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+#     width_factor = target_width / orig_width
+#     height_factor = target_height / orig_height
+#     resize_factor = max(width_factor, height_factor)
+#     inter_width = int(orig_width * resize_factor)
+#     inter_height = int(orig_height * resize_factor)
+#     target_fps = 8
+#     ideal_skip = max(0, math.ceil(orig_fps / target_fps) - 1)
+#     skip = min(5, ideal_skip)  # Cap at 5
+#     while (total_frames / (skip + 1)) < 49 and skip > 0:
+#         skip -= 1
+#     processed_frames = []
+#     frame_count = 0
+#     total_read = 0
+#     while frame_count < 49 and total_read < total_frames:
+#         ret, frame = cap.read()
+#         if not ret:
+#             break
+#         if total_read % (skip + 1) == 0:
+#             resized = cv2.resize(frame, (inter_width, inter_height), interpolation=cv2.INTER_AREA)
+#             start_x = (inter_width - target_width) // 2
+#             start_y = (inter_height - target_height) // 2
+#             cropped = resized[start_y : start_y + target_height, start_x : start_x + target_width]
+#             processed_frames.append(cropped)
+#             frame_count += 1
+#         total_read += 1
+#     cap.release()
+#     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
+#         temp_video_path = temp_file.name
+#         fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+#         out = cv2.VideoWriter(temp_video_path, fourcc, target_fps, (target_width, target_height))
+#         for frame in processed_frames:
+#             out.write(frame)
+#         out.release()
+#     return temp_video_path
 # def convert_prompt(prompt: str, retry_times: int = 3) -> str:
 def infer(
     prompt: str,
+    # image_input: str,
+    # video_input: str,
+    # video_strenght: float,
     num_inference_steps: int,
     guidance_scale: float,
     seed: int = -1,
     if seed == -1:
         seed = random.randint(0, 2**8 - 1)
+    # if video_input is not None:
+    #     video = load_video(video_input)[:49]  # Limit to 49 frames
+    #     pipe_video = CogVideoXVideoToVideoPipeline.from_pretrained(
+    #         "THUDM/CogVideoX-5B",
+    #         transformer=transformer,
+    #         vae=vae,
+    #         scheduler=pipe.scheduler,
+    #         tokenizer=pipe.tokenizer,
+    #         text_encoder=text_encoder,
+    #         torch_dtype=torch.bfloat16,
+    #     ).to(device)
+    #     # pipe_video.enable_model_cpu_offload()
+    #     pipe_video.vae.enable_tiling()
+    #     pipe_video.vae.enable_slicing()
+    #     video_pt = pipe_video(
+    #         video=video,
+    #         prompt=prompt,
+    #         num_inference_steps=num_inference_steps,
+    #         num_videos_per_prompt=1,
+    #         strength=video_strenght,
+    #         use_dynamic_cfg=True,
+    #         output_type="pt",
+    #         guidance_scale=guidance_scale,
+    #         generator=torch.Generator(device="cpu").manual_seed(seed),
+    #     ).frames
+    #     pipe_video.to("cpu")
+    #     del pipe_video
+    #     gc.collect()
+    #     torch.cuda.empty_cache()
+    # elif image_input is not None:
+    #     pipe_image = CogVideoXImageToVideoPipeline.from_pretrained(
+    #         "THUDM/CogVideoX-5B-I2V",
+    #         transformer=i2v_transformer,
+    #         vae=i2v_vae,
+    #         scheduler=pipe.scheduler,
+    #         tokenizer=pipe.tokenizer,
+    #         text_encoder=i2v_text_encoder,
+    #         torch_dtype=torch.bfloat16,
+    #     ).to(device)
+    #     image_input = Image.fromarray(image_input).resize(size=(720, 480))  # Convert to PIL
+    #     image = load_image(image_input)
+    #     video_pt = pipe_image(
+    #         image=image,
+    #         prompt=prompt,
+    #         num_inference_steps=num_inference_steps,
+    #         num_videos_per_prompt=1,
+    #         use_dynamic_cfg=True,
+    #         output_type="pt",
+    #         guidance_scale=guidance_scale,
+    #         generator=torch.Generator(device="cpu").manual_seed(seed),
+    #     ).frames
+    #     pipe_image.to("cpu")
+    #     del pipe_image
+    #     gc.collect()
+    #     torch.cuda.empty_cache()
+    # else:
+    pipe.to("cpu")
+    video_pt = pipe(
+        prompt=prompt,
+        num_videos_per_prompt=1,
+        num_inference_steps=num_inference_steps,
+        num_frames=16,
+        use_dynamic_cfg=True,
+        output_type="pt",
+        guidance_scale=guidance_scale,
+        generator=torch.Generator(device="cpu").manual_seed(seed),
+    ).frames
+    pipe.to("cpu")
+    gc.collect()
     return (video_pt, seed)
            """)
     with gr.Row():
         with gr.Column():
+            # with gr.Accordion("I2V: Image Input (cannot be used simultaneously with video input)", open=False):
+            #     image_input = gr.Image(label="Input Image (will be cropped to 720 * 480)")
+            #     examples_component_images = gr.Examples(examples_images, inputs=[image_input], cache_examples=False)
+            # with gr.Accordion("V2V: Video Input (cannot be used simultaneously with image input)", open=False):
+            #     video_input = gr.Video(label="Input Video (will be cropped to 49 frames, 6 seconds at 8fps)")
+            #     strength = gr.Slider(0.1, 1.0, value=0.8, step=0.01, label="Strength")
+            #     examples_component_videos = gr.Examples(examples_videos, inputs=[video_input], cache_examples=False)
             prompt = gr.Textbox(label="Prompt (Less than 200 Words)", placeholder="Enter your prompt here", lines=5)
             # with gr.Row():
     @spaces.GPU(duration=120)
     def generate(
         prompt,
+        # image_input,
+        # video_input,
+        # video_strength,
         seed_value,
         # scale_status,
         # rife_status,
     ):
         latents, seed = infer(
             prompt,
+            # image_input,
+            # video_input,
+            # video_strength,
+            num_inference_steps=50,  # NOT Changed
             guidance_scale=7.0,  # NOT Changed
             seed=seed_value,
             progress=progress,
     generate_button.click(
         generate,
+        inputs=[prompt, seed_param],
+        # inputs=[prompt, image_input, video_input, strength, seed_param],
         # inputs=[prompt, image_input, video_input, strength, seed_param, enable_scale, enable_rife],
         outputs=[video_output, download_video_button, download_gif_button, seed_text],
     )
     # enhance_button.click(enhance_prompt_func, inputs=[prompt], outputs=[prompt])
+    # video_input.upload(resize_if_unfit, inputs=[video_input], outputs=[video_input])
 if __name__ == "__main__":
     utils.install_packages()