# text2video / app.py
# ozilion's picture
# Update app.py
# c40d82c verified
# raw
# history blame
# 23.6 kB
import gradio as gr
import torch
import os
import gc
import numpy as np
import tempfile
from typing import Optional, Tuple
import time
# ZeroGPU support
try:
    import spaces
    SPACES_AVAILABLE = True
except ImportError:
    SPACES_AVAILABLE = False

    class spaces:
        """No-op stand-in used when the ``spaces`` package is not installed.

        Only the ``GPU`` decorator is provided, so GPU-decorated functions
        in this file still import and run unchanged outside Spaces.
        """

        @staticmethod
        def GPU(func=None, duration=300, **_options):
            """Return the decorated function unchanged.

            Works both as a bare decorator (``@spaces.GPU``) and as a
            decorator factory (``@spaces.GPU(duration=120)``).

            Args:
                func: The function being decorated when used bare; any
                    non-callable value (e.g. a positional duration) is
                    treated as a factory call.
                duration: Accepted for signature compatibility; ignored.
                **_options: Any further keyword options; ignored.
            """
            if callable(func):
                # Bare usage: ``@spaces.GPU`` passes the function directly.
                return func

            def decorator(inner):
                return inner

            return decorator
# Environment
# Runtime flags, evaluated once at import time.
IS_ZERO_GPU = os.getenv("SPACES_ZERO_GPU") == "true"  # only the exact string "true" counts
IS_SPACES = "SPACE_ID" in os.environ  # the variable merely has to be set
HAS_CUDA = torch.cuda.is_available()
print(f"πŸš€ H200 Premium Setup: ZeroGPU={IS_ZERO_GPU}, Spaces={IS_SPACES}, CUDA={HAS_CUDA}")
# PREMIUM MODELS ONLY - No low quality fallbacks
#
# One entry per candidate model. Keys read elsewhere in this file:
#   id                 - repo id passed to ``from_pretrained``
#   name               - display name; also used to pick per-model parameter limits
#   pipeline_class     - "CogVideoXPipeline" selects that class, anything else
#                        falls back to the generic DiffusionPipeline
#   resolution_options - allowed (width, height) pairs; the first is the default
#   max_frames         - upper bound applied to the requested frame count
#   dtype              - torch dtype used for loading and autocast
#   fps                - frame rate used when exporting the video
#   priority           - load order, lowest value is tried first
#   description        - free text shown in the status/report output
PREMIUM_MODELS = [
    dict(
        id="THUDM/CogVideoX-5b",
        name="CogVideoX-5B",
        pipeline_class="CogVideoXPipeline",
        resolution_options=[(720, 480), (480, 720)],
        max_frames=49,
        dtype=torch.bfloat16,
        fps=8,
        priority=1,
        description="5B parameter video model - high quality",
    ),
    dict(
        id="THUDM/CogVideoX-2b",
        name="CogVideoX-2B",
        pipeline_class="CogVideoXPipeline",
        resolution_options=[(720, 480), (480, 720)],
        max_frames=49,
        dtype=torch.bfloat16,
        fps=8,
        priority=2,
        description="2B parameter model - faster generation",
    ),
    dict(
        id="Lightricks/LTX-Video",
        name="LTX-Video",
        pipeline_class="DiffusionPipeline",
        resolution_options=[(512, 512), (768, 768)],
        max_frames=121,  # LTX supports longer videos
        dtype=torch.bfloat16,
        fps=24,  # Higher FPS
        priority=3,
        description="Professional video generation model",
    ),
]
# Global variables
# Shared module state, filled in lazily by the loading functions below.
MODEL: Optional[object] = None      # the loaded pipeline, or None until a load succeeds
MODEL_INFO: Optional[dict] = None   # the PREMIUM_MODELS entry that MODEL was built from
LOADING_LOGS: list = []             # timestamped log lines appended by log_loading()
def log_loading(message):
    """Print ``message`` with an ``[HH:MM:SS]`` prefix and record it.

    The prefixed line is written to stdout and appended to the module-level
    ``LOADING_LOGS`` list.
    """
    entry = f"[{time.strftime('%H:%M:%S')}] {message}"
    print(entry)
    LOADING_LOGS.append(entry)
def get_h200_memory():
    """Report memory figures for CUDA device 0, in GiB.

    Returns:
        A ``(total, allocated, reserved)`` tuple. ``(0, 0, 0)`` is returned
        when CUDA is unavailable or when querying the device fails.
    """
    if not HAS_CUDA:
        return 0, 0, 0
    gib = 1024**3
    try:
        total = torch.cuda.get_device_properties(0).total_memory / gib
        allocated = torch.cuda.memory_allocated(0) / gib
        reserved = torch.cuda.memory_reserved(0) / gib
    except Exception:
        # Best-effort diagnostics only: a failed query must not break the
        # caller. ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt / SystemExit propagate.
        return 0, 0, 0
    return total, allocated, reserved
def load_premium_model():
    """Make sure a model is loaded, trying candidates in priority order.

    If ``MODEL`` is already set nothing happens. Otherwise the loading log is
    reset and each ``PREMIUM_MODELS`` entry is attempted, lowest ``priority``
    value first, stopping at the first success.

    Returns:
        True if a model is (now) loaded, False if every candidate failed.
    """
    global LOADING_LOGS

    if MODEL is not None:
        return True

    # Start a fresh log for this loading attempt.
    LOADING_LOGS = []
    log_loading("🎯 H200 Premium Model Loading - QUALITY PRIORITY")

    total_mem, allocated_mem, reserved_mem = get_h200_memory()
    log_loading(f"πŸ’Ύ H200 Memory: {total_mem:.1f}GB total, {allocated_mem:.1f}GB allocated, {reserved_mem:.1f}GB reserved")

    # Sort by priority (premium first)
    candidates = list(PREMIUM_MODELS)
    candidates.sort(key=lambda entry: entry["priority"])

    # any() stops at the first candidate that loads successfully.
    if any(try_load_premium_model(candidate) for candidate in candidates):
        return True

    log_loading("❌ All premium models failed - check model availability")
    return False
def _release_gpu_memory():
    """Drop unreachable Python objects, then release cached CUDA memory."""
    # Collect first: empty_cache() can only release blocks that are no longer
    # occupied, so garbage holding tensors has to be freed before it runs.
    gc.collect()
    if HAS_CUDA:
        torch.cuda.empty_cache()
        torch.cuda.synchronize()


def try_load_premium_model(config):
    """Try to load one model described by ``config`` and make it the active model.

    Args:
        config: A ``PREMIUM_MODELS`` entry. Reads ``id``, ``name``,
            ``pipeline_class``, ``dtype``, ``max_frames`` and ``fps``.

    Returns:
        True on success (``MODEL`` and ``MODEL_INFO`` are updated), False if
        anything raised during import, download, device transfer or setup.
        Failures are logged through ``log_loading`` and never re-raised.
    """
    global MODEL, MODEL_INFO
    model_id = config["id"]
    model_name = config["name"]
    log_loading(f"πŸ”„ Loading {model_name} (Premium)...")
    log_loading(f" πŸ“‹ Target: {config['pipeline_class']}, {config['max_frames']} frames, {config['fps']} fps")

    pipe = None
    try:
        # Clear H200 memory
        _release_gpu_memory()

        # Import specific pipeline
        if config["pipeline_class"] == "CogVideoXPipeline":
            from diffusers import CogVideoXPipeline
            PipelineClass = CogVideoXPipeline
            log_loading(f" πŸ“₯ Using CogVideoXPipeline...")
        else:
            from diffusers import DiffusionPipeline
            PipelineClass = DiffusionPipeline
            log_loading(f" πŸ“₯ Using DiffusionPipeline...")

        # Load with premium settings
        log_loading(f" πŸ”„ Downloading/Loading model...")
        # NOTE(review): trust_remote_code=True allows code shipped in the model
        # repository to execute locally. Kept as-is; confirm it is actually
        # required for the repositories listed in PREMIUM_MODELS.
        pipe = PipelineClass.from_pretrained(
            model_id,
            torch_dtype=config["dtype"],
            trust_remote_code=True,
            # No variant, no use_safetensors restrictions
        )

        # Move to H200 and optimize
        if HAS_CUDA:
            log_loading(f" πŸ“± Moving to H200 CUDA...")
            pipe = pipe.to("cuda")

        # Premium optimizations for H200's 69.5GB
        # Each optimization is applied only if this pipeline exposes it.
        if hasattr(pipe, 'enable_vae_slicing'):
            pipe.enable_vae_slicing()
            log_loading(f" ⚑ VAE slicing enabled")
        if hasattr(pipe, 'enable_vae_tiling'):
            pipe.enable_vae_tiling()
            log_loading(f" ⚑ VAE tiling enabled")
        if hasattr(pipe, 'enable_memory_efficient_attention'):
            pipe.enable_memory_efficient_attention()
            log_loading(f" ⚑ Memory efficient attention enabled")

        # For H200's large memory, keep everything in GPU
        log_loading(f" πŸš€ Keeping full model in H200 GPU memory")

        # Memory check after loading
        total_mem, allocated_mem, reserved_mem = get_h200_memory()
        log_loading(f" πŸ’Ύ Post-load: {allocated_mem:.1f}GB allocated, {reserved_mem:.1f}GB reserved")

        # Validate model capabilities
        expected_frames = config["max_frames"]
        expected_fps = config["fps"]
        log_loading(f" βœ… {model_name} ready: {expected_frames} max frames @ {expected_fps} fps")

        MODEL = pipe
        MODEL_INFO = config
        log_loading(f"🎯 SUCCESS: {model_name} loaded for premium generation!")
        return True
    except Exception as e:
        log_loading(f"❌ {model_name} failed: {str(e)}")
        # Drop our reference to a partially loaded pipeline *before* cleaning
        # up; otherwise its weights stay reachable and cannot be reclaimed.
        pipe = None
        _release_gpu_memory()
        return False
# ``spaces`` is either the real package or the no-op fallback class defined at
# the top of this file, so the decorator can be applied unconditionally.
@spaces.GPU(duration=300)
def generate_premium_video(
    prompt: str,
    negative_prompt: str = "",
    num_frames: int = 49,
    resolution: str = "720x480",
    num_inference_steps: int = 50,
    guidance_scale: float = 6.0,
    seed: int = -1
) -> Tuple[Optional[str], str]:
    """Generate a video from a text prompt with the active model.

    Args:
        prompt: Text description; must contain at least 10 non-blank characters.
        negative_prompt: Things to avoid; a built-in default is used when blank.
        num_frames: Requested frame count, clamped to ``[16, model max_frames]``.
        resolution: ``"<width>x<height>"``; replaced by the model's first
            supported resolution when unparsable or unsupported.
        num_inference_steps: Raised to the per-model minimum when lower.
        guidance_scale: Clamped into the per-model range.
        seed: ``-1`` (or ``None``) picks a random seed.

    Returns:
        ``(video_path, report)`` on success, ``(None, error_message)`` on any
        failure. Exceptions raised during generation are caught and reported
        in the message rather than propagated.
    """
    # Input validation comes first so that an unusable prompt does not trigger
    # an expensive model download/load.
    prompt = prompt or ""
    cleaned_prompt = prompt.strip()
    if not cleaned_prompt:
        return None, "❌ Please enter a detailed prompt for premium generation."
    if len(cleaned_prompt) < 10:
        return None, "❌ Please provide a more detailed prompt (minimum 10 characters)."

    # Load premium model
    if not load_premium_model():
        logs = "\n".join(LOADING_LOGS[-5:])
        return None, f"❌ No premium models available\n\nLogs:\n{logs}"

    # Parse resolution
    try:
        width, height = map(int, resolution.split('x'))
    except (AttributeError, ValueError):
        # Not a string, not two parts, or non-numeric parts.
        width, height = MODEL_INFO["resolution_options"][0]

    # Validate resolution
    if (width, height) not in MODEL_INFO["resolution_options"]:
        width, height = MODEL_INFO["resolution_options"][0]
        log_loading(f"⚠️ Resolution adjusted to {width}x{height}")

    # Validate frames
    max_frames = MODEL_INFO["max_frames"]
    num_frames = min(max(int(num_frames), 16), max_frames)  # Minimum 16 for quality
    num_inference_steps = int(num_inference_steps)

    # Model-specific parameter optimization
    if MODEL_INFO["name"].startswith("CogVideoX"):
        # CogVideoX optimal parameters
        guidance_scale = max(6.0, min(guidance_scale, 7.0))  # CogVideoX sweet spot
        num_inference_steps = max(50, num_inference_steps)  # Higher steps for quality
    elif MODEL_INFO["name"] == "LTX-Video":
        # LTX-Video optimal parameters
        guidance_scale = max(7.0, min(guidance_scale, 8.5))  # LTX sweet spot
        num_inference_steps = max(30, num_inference_steps)

    try:
        # H200 memory preparation
        start_memory = torch.cuda.memory_allocated(0) / (1024**3) if HAS_CUDA else 0

        # Enhanced seed handling
        if seed is None or int(seed) == -1:
            # Explicit 64-bit dtype keeps the upper bound valid on platforms
            # whose default NumPy integer is 32-bit.
            seed = int(np.random.randint(0, 2**32 - 1, dtype=np.int64))
        else:
            seed = int(seed)
        device = "cuda" if HAS_CUDA else "cpu"
        generator = torch.Generator(device=device).manual_seed(seed)

        log_loading(f"🎬 PREMIUM GENERATION START")
        log_loading(f"πŸ“‹ Model: {MODEL_INFO['name']}")
        log_loading(f"πŸ“ Resolution: {width}x{height}")
        log_loading(f"🎞️ Frames: {num_frames} @ {MODEL_INFO['fps']} fps = {num_frames/MODEL_INFO['fps']:.1f}s video")
        log_loading(f"βš™οΈ Steps: {num_inference_steps}, Guidance: {guidance_scale}")
        log_loading(f"πŸ“ Prompt: {prompt[:100]}...")
        start_time = time.time()

        # Prepare generation parameters
        gen_kwargs = {
            "prompt": prompt,
            "height": height,
            "width": width,
            "num_frames": num_frames,
            "num_inference_steps": num_inference_steps,
            "guidance_scale": guidance_scale,
            "generator": generator,
        }

        # Add negative prompt for quality
        if (negative_prompt or "").strip():
            gen_kwargs["negative_prompt"] = negative_prompt
        else:
            # Default negative prompt for premium quality
            default_negative = "blurry, low quality, distorted, pixelated, compression artifacts, watermark, text, signature, amateur, static, boring"
            gen_kwargs["negative_prompt"] = default_negative
            log_loading(f"🚫 Using default negative prompt for quality")

        # Model-specific parameters
        if MODEL_INFO["name"].startswith("CogVideoX"):
            gen_kwargs["num_videos_per_prompt"] = 1
            log_loading(f"πŸŽ₯ CogVideoX generation starting...")

        # Generate with progress
        log_loading(f"πŸš€ H200 generation in progress...")
        # Premium generation with optimal autocast
        with torch.autocast(device, dtype=MODEL_INFO["dtype"], enabled=HAS_CUDA):
            result = MODEL(**gen_kwargs)
        end_time = time.time()
        generation_time = end_time - start_time

        # Extract video frames
        if hasattr(result, 'frames'):
            video_frames = result.frames[0]
            log_loading(f"πŸ“Ή Extracted {len(video_frames)} frames")
        elif hasattr(result, 'videos'):
            video_frames = result.videos[0]
            log_loading(f"πŸ“Ή Extracted video tensor: {video_frames.shape}")
        else:
            log_loading(f"❌ Unknown result format: {type(result)}")
            return None, "❌ Could not extract video frames from result"

        # Export with proper FPS
        target_fps = MODEL_INFO["fps"]
        actual_duration = num_frames / target_fps
        from diffusers.utils import export_to_video
        # Reserve a unique path and close our handle before the exporter
        # writes to it, so the file is never open twice at once.
        fd, video_path = tempfile.mkstemp(suffix=".mp4")
        os.close(fd)
        try:
            export_to_video(video_frames, video_path, fps=target_fps)
        except Exception:
            # Do not leave an empty/partial file behind when export fails.
            if os.path.exists(video_path):
                os.remove(video_path)
            raise
        log_loading(f"🎬 Exported to {video_path} @ {target_fps} fps")

        # Memory stats
        end_memory = torch.cuda.memory_allocated(0) / (1024**3) if HAS_CUDA else 0
        memory_used = end_memory - start_memory

        # Success report
        success_msg = f"""🎯 **PREMIUM H200 VIDEO GENERATED**
πŸ€– **Model:** {MODEL_INFO['name']}
πŸ“ **Prompt:** {prompt}
🎬 **Video:** {num_frames} frames @ {target_fps} fps = **{actual_duration:.1f} seconds**
πŸ“ **Resolution:** {width}x{height}
βš™οΈ **Quality:** {num_inference_steps} inference steps
🎯 **Guidance:** {guidance_scale}
🎲 **Seed:** {seed}
⏱️ **Generation Time:** {generation_time:.1f}s ({generation_time/60:.1f} minutes)
πŸ–₯️ **Device:** H200 MIG (69.5GB)
πŸ’Ύ **Memory Used:** {memory_used:.1f}GB
πŸ“‹ **Model Notes:** {MODEL_INFO['description']}
**πŸŽ₯ Video Quality:** Premium quality with {num_frames} frames over {actual_duration:.1f} seconds"""
        log_loading(f"βœ… PREMIUM generation completed: {actual_duration:.1f}s video in {generation_time:.1f}s")
        return video_path, success_msg
    except torch.cuda.OutOfMemoryError:
        # Collect garbage first so the cache release can actually free memory.
        gc.collect()
        if HAS_CUDA:
            torch.cuda.empty_cache()
        return None, "❌ H200 memory exceeded. Try reducing frames or resolution."
    except Exception as e:
        gc.collect()
        if HAS_CUDA:
            torch.cuda.empty_cache()
        error_msg = str(e)
        log_loading(f"❌ Generation error: {error_msg}")
        return None, f"❌ Premium generation failed: {error_msg}"
def get_model_status():
    """Build the Markdown status text for the currently loaded model.

    Returns a short placeholder while no model is loaded; otherwise a summary
    built from ``MODEL_INFO`` (name, maximum duration, resolutions, description).
    """
    if MODEL is not None:
        max_frames = MODEL_INFO["max_frames"]
        fps = MODEL_INFO["fps"]
        max_duration = max_frames / fps
        # Render each (width, height) pair as "WxH".
        resolutions = ", ".join(f"{w}x{h}" for (w, h) in MODEL_INFO["resolution_options"])
        return f"""🎯 **{MODEL_INFO['name']} Ready**
**πŸ“‹ Premium Capabilities:**
- **Max Duration:** {max_duration:.1f} seconds ({max_frames} frames @ {fps} fps)
- **Resolutions:** {resolutions}
- **Quality:** {MODEL_INFO['description']}
**⚑ H200 Optimizations:**
- Full model in GPU memory
- Memory efficient attention
- VAE optimizations enabled
**πŸ’‘ This model produces {max_duration:.1f} second videos with {max_frames} frames!**"""
    return "⏳ **No premium model loaded** - will auto-load on generation"
def get_loading_logs():
    """Return every recorded log line joined by newlines.

    A fixed placeholder sentence is returned while ``LOADING_LOGS`` is empty.
    """
    return "\n".join(LOADING_LOGS) if LOADING_LOGS else "No loading attempts yet."
def suggest_premium_settings():
    """Return Markdown with suggested generation settings for the loaded model.

    Returns a short hint when no model is loaded. Otherwise three presets
    (maximum quality, balanced, fast test) derived from ``MODEL_INFO``,
    followed by prompting tips.
    """
    if MODEL is None:
        return "Load a premium model first."
    model_name = MODEL_INFO['name']
    max_frames = MODEL_INFO['max_frames']
    fps = MODEL_INFO['fps']
    max_duration = max_frames / fps
    # Same suggested guidance for all three presets; computed once.
    guidance = 6.0 if 'CogVideo' in model_name else 7.5
    # Show the resolution as "WxH", the form used by the resolution dropdown
    # and the status panel, rather than as a raw tuple.
    best_width, best_height = MODEL_INFO['resolution_options'][-1]
    return f"""## 🎯 Optimal Settings for {model_name}
**πŸš€ Maximum Quality:**
- Frames: {max_frames} (full {max_duration:.1f} second video)
- Inference Steps: 50+
- Guidance Scale: {guidance}
- Resolution: {best_width}x{best_height}
**βš–οΈ Balanced (Recommended):**
- Frames: {max_frames//2} ({max_frames//2/fps:.1f} second video)
- Inference Steps: 35-50
- Guidance Scale: {guidance}
**⚑ Fast Test:**
- Frames: 25 ({25/fps:.1f} second video)
- Inference Steps: 30
- Guidance Scale: {guidance}
**πŸ“ Premium Prompting Tips:**
- Be very specific and detailed
- Include camera movements: "slow zoom", "tracking shot"
- Describe lighting: "golden hour", "cinematic lighting"
- Add style: "professional cinematography", "8K quality"
- Mention motion: "smooth movement", "graceful motion"
**Example Premium Prompt:**
"A majestic golden eagle soaring gracefully through misty mountain peaks during golden hour, cinematic tracking shot with shallow depth of field, professional wildlife cinematography, smooth gliding motion, warm sunset lighting, 8K quality"
Remember: Longer videos need more detailed prompts to maintain coherence!"""
# Create premium interface
# Two tabs: one for generating videos, one for inspecting model status and logs.
with gr.Blocks(title="H200 Premium Video Generator", theme=gr.themes.Glass()) as demo:
    gr.Markdown("""
# 🎯 H200 Premium Video Generator
**Premium Models Only** β€’ **Long-Form Videos** β€’ **Professional Quality**
*CogVideoX-5B β€’ LTX-Video β€’ No Low-Quality Fallbacks*
""")
    # Premium status
    with gr.Row():
        gr.Markdown("""
<div style="background: linear-gradient(45deg, #FFD700, #FF6B6B); padding: 15px; border-radius: 15px; text-align: center; color: white; font-weight: bold; font-size: 18px;">
πŸ† PREMIUM MODE - H200 MIG 69.5GB - QUALITY PRIORITY πŸ†
</div>
""")
    with gr.Tab("🎬 Premium Generation"):
        with gr.Row():
            # Left column: prompt inputs, generation settings and the button.
            with gr.Column(scale=1):
                prompt_input = gr.Textbox(
                    label="πŸ“ Detailed Video Prompt (Premium Quality)",
                    placeholder="A breathtaking aerial view of a majestic golden eagle soaring gracefully through dramatic mountain peaks shrouded in morning mist, cinematic wildlife documentary style with slow motion tracking shot, professional cinematography with warm golden hour lighting and shallow depth of field, smooth gliding motion across epic landscape, 8K quality with film grain texture...",
                    lines=5,
                    max_lines=8
                )
                negative_prompt_input = gr.Textbox(
                    label="🚫 Negative Prompt (Optional - auto-applied for quality)",
                    placeholder="blurry, low quality, distorted, pixelated, compression artifacts, watermark, text, signature, amateur, static, boring, jerky motion...",
                    lines=2
                )
                with gr.Accordion("🎯 Premium Settings", open=True):
                    with gr.Row():
                        # Frame range and resolution choices here match the
                        # CogVideoX entries in PREMIUM_MODELS (49 frames max,
                        # 720x480 / 480x720).
                        num_frames = gr.Slider(
                            minimum=16,
                            maximum=49,
                            value=49,
                            step=1,
                            label="🎬 Video Frames (16 = 2s, 49 = 6s+)"
                        )
                        resolution = gr.Dropdown(
                            choices=["720x480", "480x720"],
                            value="720x480",
                            label="πŸ“ Resolution"
                        )
                    with gr.Row():
                        num_steps = gr.Slider(
                            minimum=30,
                            maximum=100,
                            value=50,
                            step=5,
                            label="βš™οΈ Inference Steps (50+ for premium quality)"
                        )
                        guidance_scale = gr.Slider(
                            minimum=4.0,
                            maximum=10.0,
                            value=6.0,
                            step=0.5,
                            label="🎯 Guidance Scale"
                        )
                    # -1 is the sentinel generate_premium_video treats as "random seed".
                    seed = gr.Number(
                        label="🎲 Seed (-1 for random)",
                        value=-1,
                        precision=0
                    )
                generate_btn = gr.Button(
                    "🎯 Generate Premium Video",
                    variant="primary",
                    size="lg"
                )
                gr.Markdown("""
**⏱️ Premium Generation:** 2-5 minutes for quality
**πŸŽ₯ Output:** 2-6+ second high-quality videos
**πŸ’‘ Premium Tips:**
- Use very detailed, specific prompts
- Higher inference steps = better quality
- Longer videos need more descriptive prompts
""")
            # Right column: the generated video and the text report.
            with gr.Column(scale=1):
                video_output = gr.Video(
                    label="πŸŽ₯ Premium H200 Generated Video",
                    height=400
                )
                result_text = gr.Textbox(
                    label="πŸ“‹ Premium Generation Report",
                    lines=12,
                    show_copy_button=True
                )
        # Generate button
        # The inputs are listed in the same order as generate_premium_video's
        # parameters; its (video_path, message) result fills the two outputs.
        generate_btn.click(
            fn=generate_premium_video,
            inputs=[
                prompt_input, negative_prompt_input, num_frames,
                resolution, num_steps, guidance_scale, seed
            ],
            outputs=[video_output, result_text]
        )
        # Premium examples
        # Each example row supplies one value per input component, in order:
        # prompt, negative prompt, frames, resolution, steps, guidance, seed.
        gr.Examples(
            examples=[
                [
                    "A majestic golden eagle soaring gracefully through misty mountain peaks during golden hour, cinematic wildlife documentary style with slow motion tracking shot, professional cinematography with warm lighting and shallow depth of field, smooth gliding motion, 8K quality",
                    "blurry, low quality, static, amateur, pixelated",
                    49, "720x480", 50, 6.0, 42
                ],
                [
                    "Ocean waves crashing against dramatic coastal cliffs during a storm, professional seascape cinematography with dynamic camera movement, slow motion water spray and foam, dramatic lighting with storm clouds, high contrast and deep blues, cinematic quality",
                    "calm, peaceful, low quality, static, boring",
                    41, "720x480", 60, 6.5, 123
                ],
                [
                    "A steaming artisanal coffee cup on rustic wooden table by rain-streaked window, cozy cafe atmosphere with warm ambient lighting, shallow depth of field with bokeh background, steam rising elegantly, professional commercial cinematography, intimate close-up shot",
                    "cold, harsh lighting, plastic, fake, low quality, distorted",
                    33, "720x480", 45, 6.0, 456
                ],
                [
                    "Time-lapse of cherry blossom petals falling like snow in traditional Japanese garden with wooden bridge over koi pond, peaceful zen atmosphere with soft natural lighting, seasonal transition captured in cinematic wide shot, perfect composition and color grading",
                    "modern, urban, chaotic, low quality, static, artificial",
                    49, "720x480", 55, 6.5, 789
                ]
            ],
            inputs=[prompt_input, negative_prompt_input, num_frames, resolution, num_steps, guidance_scale, seed]
        )
    with gr.Tab("🎯 Premium Status"):
        with gr.Row():
            status_btn = gr.Button("πŸ” Model Status", variant="secondary")
            logs_btn = gr.Button("πŸ“‹ Loading Logs", variant="secondary")
            settings_btn = gr.Button("βš™οΈ Optimal Settings", variant="secondary")
        status_output = gr.Markdown()
        logs_output = gr.Textbox(label="Detailed Logs", lines=12, show_copy_button=True)
        settings_output = gr.Markdown()
        # Each button calls its argument-less reporting function and shows the
        # returned text in the matching output component.
        status_btn.click(fn=get_model_status, outputs=status_output)
        logs_btn.click(fn=get_loading_logs, outputs=logs_output)
        settings_btn.click(fn=suggest_premium_settings, outputs=settings_output)
    # Auto-load status
    # Fills the status panel with get_model_status() on the page-load event.
    demo.load(fn=get_model_status, outputs=status_output)
if __name__ == "__main__":
    # Premium quality needs smaller queue
    demo.queue(max_size=2)

    # Listen on all interfaces, port 7860; no public share link, and show
    # errors in the UI.
    launch_options = {
        "share": False,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)