Update app.py

app.py CHANGED
@@ -56,6 +56,13 @@ pipeline = wan.WanTI2V(
 print("Pipeline initialized and ready.")
 
 # --- Helper Functions ---
+def clear_gpu_memory():
+    """Clear GPU memory more thoroughly"""
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
+    gc.collect()
+
 def select_best_size_for_image(image, available_sizes):
     """Select the size option with aspect ratio closest to the input image."""
     if image is None:
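The helper releases cached allocator blocks and CUDA IPC handles. Note that `torch.cuda.empty_cache()` can only return blocks whose tensors have already been freed, so dropping references (or running `gc.collect()`) before calling it releases the most memory. A minimal sanity check, assuming a CUDA device and the `clear_gpu_memory` helper above in scope:

```python
import torch

if torch.cuda.is_available():
    x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB float32 allocation
    del x                                       # free the tensor first...
    clear_gpu_memory()                          # ...then release cached blocks
    print(f"reserved after cleanup: {torch.cuda.memory_reserved()} bytes")
```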
@@ -90,6 +97,23 @@ def handle_image_upload(image)
 
     return gr.update(value=best_size)
 
+def validate_inputs(image, prompt, duration_seconds):
+    """Validate user inputs"""
+    errors = []
+
+    if not prompt or len(prompt.strip()) < 5:
+        errors.append("Prompt must be at least 5 characters long.")
+
+    if image is not None:
+        img = Image.fromarray(image)
+        if img.size[0] * img.size[1] > 4096 * 4096:
+            errors.append("Image size is too large (maximum 4096x4096).")
+
+    if duration_seconds > 5.0 and image is None:
+        errors.append("Videos longer than 5 seconds require an input image.")
+
+    return errors
+
 def get_duration(image,
                  prompt,
                  size,
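A quick sketch of the validator's behavior (assumes `validate_inputs` and the PIL/numpy imports from app.py are in scope):

```python
import numpy as np

dummy = np.zeros((8, 8, 3), dtype=np.uint8)

# Short prompt -> error; long duration without an image -> error.
assert validate_inputs(None, "hi", 2.0) == ["Prompt must be at least 5 characters long."]
assert validate_inputs(None, "a boat sailing", 6.0) == ["Videos longer than 5 seconds require an input image."]
assert validate_inputs(dummy, "a boat sailing", 6.0) == []  # an image lifts the duration limit
```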
@@ -107,6 +131,14 @@ def get_duration(image,
     else:
         return 90
 
+def apply_template(template, current_prompt):
+    """Apply prompt template"""
+    if "{subject}" in template:
+        # Extract the main subject from current prompt (simple heuristic)
+        subject = current_prompt.split(",")[0] if "," in current_prompt else current_prompt
+        return template.replace("{subject}", subject)
+    return template + " " + current_prompt
+
 # --- 2. Gradio Inference Function ---
 @spaces.GPU(duration=get_duration)
 def generate_video(
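The `{subject}` placeholder is filled with everything before the first comma of the current prompt; templates without the placeholder are simply prepended. For example (assuming `apply_template` from above):

```python
template = "cinematic shot of {subject}, professional lighting"

print(apply_template(template, "a red fox, running through snow"))
# -> cinematic shot of a red fox, professional lighting

print(apply_template("slow motion.", "a red fox"))
# -> slow motion. a red fox
```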
@@ -121,9 +153,18 @@ def generate_video(
     progress=gr.Progress(track_tqdm=True)
 ):
     """The main function to generate video, called by the Gradio interface."""
+    # Validate inputs
+    errors = validate_inputs(image, prompt, duration_seconds)
+    if errors:
+        raise gr.Error("\n".join(errors))
+
+    progress(0, desc="Setting up...")
+
     if seed == -1:
         seed = random.randint(0, sys.maxsize)
 
+    progress(0.1, desc="Processing image...")
+
     input_image = None
     if image is not None:
         input_image = Image.fromarray(image).convert("RGB")
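`@spaces.GPU` accepts a callable for `duration`; on ZeroGPU Spaces the callable is invoked with the same arguments as the decorated function, so `get_duration` can budget GPU seconds from the requested settings before the slot is allocated. An illustrative sketch only; the real branching in `get_duration` is mostly elided from this excerpt, with just the `return 90` fallback visible:

```python
# Illustrative only -- not the committed get_duration. Parameter names follow
# the visible part of its signature; values here are made up.
def get_duration_sketch(image, prompt, size, *rest):
    if image is None:
        return 90       # text-to-video: flat budget
    return 120          # image-to-video: allow extra time for conditioning
```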
@@ -134,44 +175,110 @@ def generate_video(
     # Calculate number of frames based on duration
     num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+    progress(0.2, desc="Generating video...")
+
+    try:
+        video_tensor = pipeline.generate(
+            input_prompt=prompt,
+            img=input_image,  # Pass None for T2V, Image for I2V
+            size=SIZE_CONFIGS[size],
+            max_area=MAX_AREA_CONFIGS[size],
+            frame_num=num_frames,  # Use calculated frames instead of cfg.frame_num
+            shift=shift,
+            sample_solver='unipc',
+            sampling_steps=int(sampling_steps),
+            guide_scale=guide_scale,
+            seed=seed,
+            offload_model=True
+        )
 
-
-
-
-
-
-
-
-
-
+        progress(0.9, desc="Saving video...")
+
+        # Save the video to a temporary file
+        video_path = cache_video(
+            tensor=video_tensor[None],  # Add a batch dimension
+            save_file=None,  # cache_video will create a temp file
+            fps=cfg.sample_fps,
+            normalize=True,
+            value_range=(-1, 1)
+        )
+
+        progress(1.0, desc="Complete!")
+
+    except torch.cuda.OutOfMemoryError:
+        clear_gpu_memory()
+        raise gr.Error("GPU out of memory. Please try with lower settings.")
+    except Exception as e:
+        raise gr.Error(f"Video generation failed: {str(e)}")
+    finally:
+        if 'video_tensor' in locals():
+            del video_tensor
+        clear_gpu_memory()
+
     return video_path
 
 
 # --- 3. Gradio Interface ---
-css = "
+css = """
+.gradio-container {max-width: 1100px !important; margin: 0 auto}
+#output_video {height: 500px;}
+#input_image {height: 500px;}
+.template-btn {margin: 2px !important;}
+"""
+
+# Default prompt with motion emphasis
+DEFAULT_PROMPT = "Generate a video with smooth and natural movement. Objects should have visible motion while maintaining fluid transitions."
+
+# Prompt templates
+templates = {
+    "Cinematic": "cinematic shot of {subject}, professional lighting, smooth camera movement, 4k quality",
+    "Animation": "animated style {subject}, vibrant colors, fluid motion, dynamic movement",
+    "Nature": "nature documentary footage of {subject}, wildlife photography, natural movement",
+    "Slow Motion": "slow motion capture of {subject}, high speed camera, detailed motion",
+    "Action": "dynamic action shot of {subject}, fast paced movement, energetic motion"
+}
 
 with gr.Blocks(css=css, theme=gr.themes.Soft(), delete_cache=(60, 900)) as demo:
-    gr.Markdown("
-
+    gr.Markdown("""
+    # Wan 2.2 TI2V Enhanced
+
+    Generate high-quality videos using the **Wan 2.2 5B Text-Image-to-Video model**
+    [[model]](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B), [[paper]](https://arxiv.org/abs/2503.20314)
+
+    ### 💡 Tips for best results:
+    - 🖼️ Upload an image for better control over the video content
+    - ⏱️ Longer videos require more processing time
+    - 🎯 Be specific and descriptive in your prompts
+    - 🎬 Include motion-related keywords for dynamic videos
+    """)
 
     with gr.Row():
         with gr.Column(scale=2):
             image_input = gr.Image(type="numpy", label="Input Image (Optional)", elem_id="input_image")
-            prompt_input = gr.Textbox(
+            prompt_input = gr.Textbox(
+                label="Prompt",
+                value=DEFAULT_PROMPT,
+                lines=3,
+                placeholder="Describe the video you want to generate..."
+            )
+
+            # Prompt templates section
+            with gr.Accordion("Prompt Templates", open=False):
+                gr.Markdown("Click a template to apply it to your prompt:")
+                with gr.Row():
+                    template_buttons = {}
+                    for name, template in templates.items():
+                        btn = gr.Button(name, size="sm", elem_classes=["template-btn"])
+                        template_buttons[name] = (btn, template)
+
+            # Connect template buttons
+            for name, (btn, template) in template_buttons.items():
+                btn.click(
+                    fn=lambda p, t=template: apply_template(t, p),
+                    inputs=[prompt_input],
+                    outputs=prompt_input
+                )
+
             duration_input = gr.Slider(
                 minimum=round(MIN_FRAMES_MODEL/FIXED_FPS, 1),
                 maximum=round(MAX_FRAMES_MODEL/FIXED_FPS, 1),
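The per-button `t=template` default argument is what pins each click handler to its own template: Gradio passes the current textbox value as the lone positional argument (from `inputs=[prompt_input]`), and Python's late-binding closures would otherwise make every button see the final loop value. A standalone demonstration of the two binding behaviors:

```python
# Late binding: every closure sees the loop variable's final value.
handlers = [lambda: t for t in ("a", "b", "c")]
print([h() for h in handlers])   # ['c', 'c', 'c']

# Default-argument binding captures each value at definition time.
handlers = [lambda t=t: t for t in ("a", "b", "c")]
print([h() for h in handlers])   # ['a', 'b', 'c']
```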
@@ -180,18 +287,57 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(), delete_cache=(60, 900)) as demo:
                 label="Duration (seconds)",
                 info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps."
             )
-            size_input = gr.Dropdown(
+            size_input = gr.Dropdown(
+                label="Output Resolution",
+                choices=list(SUPPORTED_SIZES[TASK_NAME]),
+                value="704*1280"
+            )
+
         with gr.Column(scale=2):
             video_output = gr.Video(label="Generated Video", elem_id="output_video")
 
-
+            # Status indicators
+            with gr.Row():
+                status_text = gr.Textbox(
+                    label="Status",
+                    value="Ready",
+                    interactive=False,
+                    max_lines=1
+                )
+
     with gr.Accordion("Advanced Settings", open=False):
-        steps_input = gr.Slider(
-
-
-
+        steps_input = gr.Slider(
+            label="Sampling Steps",
+            minimum=10,
+            maximum=50,
+            value=38,
+            step=1,
+            info="Higher values = better quality but slower"
+        )
+        scale_input = gr.Slider(
+            label="Guidance Scale",
+            minimum=1.0,
+            maximum=10.0,
+            value=cfg.sample_guide_scale,
+            step=0.1,
+            info="Higher values = closer to prompt but less creative"
+        )
+        shift_input = gr.Slider(
+            label="Sample Shift",
+            minimum=1.0,
+            maximum=20.0,
+            value=cfg.sample_shift,
+            step=0.1,
+            info="Affects the sampling process dynamics"
+        )
+        seed_input = gr.Number(
+            label="Seed (-1 for random)",
+            value=-1,
+            precision=0,
+            info="Use same seed for reproducible results"
+        )
 
-    run_button = gr.Button("Generate Video", variant="primary")
+    run_button = gr.Button("Generate Video", variant="primary", size="lg")
 
     # Add image upload handler
     image_input.upload(
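The `run_button.click` wiring itself falls outside this excerpt. Given the components declared above and `generate_video`'s visible parameters, it presumably resembles the following; the exact component order here is an assumption:

```python
# Hypothetical wiring -- the committed call is not part of this diff excerpt.
run_button.click(
    fn=generate_video,
    inputs=[image_input, prompt_input, size_input, duration_input,
            steps_input, scale_input, shift_input, seed_input],
    outputs=video_output,
)
```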
@@ -206,12 +352,25 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(), delete_cache=(60, 900)) as demo:
         outputs=[size_input]
     )
 
+    # Update status when generating
+    def update_status_and_generate(*args):
+        status_text.value = "Generating..."
+        try:
+            result = generate_video(*args)
+            status_text.value = "Complete!"
+            return result
+        except Exception as e:
+            status_text.value = "Error occurred"
+            raise e
+
     example_image_path = os.path.join(os.path.dirname(__file__), "examples/i2v_input.JPG")
     gr.Examples(
         examples=[
-            [example_image_path, "The cat removes the glasses from its eyes.", "1280*704", 1.5],
-            [None, "A cinematic shot of a boat sailing on
-            [None, "Drone footage flying over a futuristic city with flying cars.", "1280*704", 2.0],
+            [example_image_path, "The cat removes the glasses from its eyes with smooth motion.", "1280*704", 1.5],
+            [None, "A cinematic shot of a boat sailing on calm waves with gentle rocking motion at sunset.", "1280*704", 2.0],
+            [None, "Drone footage flying smoothly over a futuristic city with flying cars in continuous motion.", "1280*704", 2.0],
+            [None, DEFAULT_PROMPT + " A waterfall cascading down rocks.", "704*1280", 2.5],
+            [None, DEFAULT_PROMPT + " Birds flying across a cloudy sky.", "1280*704", 3.0],
         ],
         inputs=[image_input, prompt_input, size_input, duration_input],
         outputs=video_output,
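One caveat: `update_status_and_generate` is not wired to any event in this excerpt, and assigning to `status_text.value` after launch only changes the component's initial value; it does not push an update to open browser sessions. The Gradio-idiomatic alternative is to report status through an output component, e.g. with a generator handler (a sketch, not the committed code):

```python
# Sketch: stream status via outputs instead of mutating status_text.value.
def generate_with_status(*args):
    yield gr.update(), "Generating..."   # update the status box immediately
    video = generate_video(*args)
    yield video, "Complete!"

# Hypothetical wiring; `generation_inputs` stands for the same component list
# that would be passed to generate_video:
# run_button.click(fn=generate_with_status, inputs=generation_inputs,
#                  outputs=[video_output, status_text])
```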