Spaces:
Running
on
Zero
Running
on
Zero
File size: 9,664 Bytes
def9f12 a7438d2 b1c0860 eeb2755 b1c0860 0910455 7dd6d04 a7438d2 b1c0860 a7438d2 b1c0860 a7438d2 91840f8 1e26e1c 91840f8 b1c0860 91840f8 744140b 91840f8 744140b 91840f8 a6b28a4 91840f8 b1c0860 91840f8 b1c0860 91840f8 a7438d2 91840f8 b1c0860 91840f8 4269cb2 a7438d2 b1c0860 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 |
import torch
import gradio as gr
import spaces
# Static HTML banner shown at the top of the Gradio page. It is plain text,
# so it can be built at import time without loading any model.
title = (
    '<h1 align="center">AI Video Prompt Generator</h1>\n'
    '<p align="center">Generate creative video prompts with technical specifications</p>\n'
    '<p align="center">You can use prompts with Kling, MiniMax, Hunyuan, Haiper, '
    'CogVideoX, Luma, LTX, Runway, PixVerse. </p>'
)
# Import these at global scope but don't instantiate yet
from vlm_captions import VLMCaptioning
from llm_inference_video import VideoLLMInferenceNode
# Module-wide holders for the two shared helper objects. Both start out unset;
# the accessor functions below create each one the first time it is requested.
vlm_captioner = llm_node = None
def get_vlm_captioner():
    """Return the shared ``VLMCaptioning`` instance, creating it on first use.

    The instance is stored in the module-level ``vlm_captioner`` variable, so
    later calls return the same object without constructing it again. Progress
    messages are printed around the one-time construction.
    """
    global vlm_captioner

    # Fast path: already built by an earlier call.
    if vlm_captioner is not None:
        return vlm_captioner

    print("Initializing Video Prompt Generator...")
    vlm_captioner = VLMCaptioning()
    print("Video Prompt Generator initialized successfully!")
    return vlm_captioner
def get_llm_node():
    """Return the shared ``VideoLLMInferenceNode``, creating it on first use.

    The instance is kept in the module-level ``llm_node`` variable and reused
    by every later call.
    """
    global llm_node

    # Fast path: already built by an earlier call.
    if llm_node is not None:
        return llm_node

    llm_node = VideoLLMInferenceNode()
    return llm_node
# The wrappers below fetch the captioner inside the decorated function rather
# than receiving it as an argument, so the model object is never passed in
# from the caller.
@spaces.GPU()
def describe_image_wrapper(image, question="Describe this image in detail."):
    """Describe an uploaded image, optionally answering a specific question.

    Args:
        image: The image to describe, as supplied by the UI; ``None`` when
            nothing has been uploaded.
        question: Prompt for the captioner. A missing, empty, or
            whitespace-only value is replaced by the default prompt.

    Returns:
        A fixed hint string when no image was given, otherwise whatever the
        captioner's ``describe_image`` returns.
    """
    if image is None:
        return "Please upload an image."

    # A non-blank question is forwarded exactly as typed (not stripped).
    is_blank = not question or not question.strip()
    prompt = "Describe this image in detail." if is_blank else question

    captioner = get_vlm_captioner()
    return captioner.describe_image(image=image, question=prompt)
@spaces.GPU()
def describe_video_wrapper(video, frame_interval=30):
    """Describe an uploaded video using the shared captioner.

    Args:
        video: The video to analyse, forwarded as ``video_path``; ``None``
            when nothing has been uploaded.
        frame_interval: Passed through unchanged to ``describe_video``
            (presumably the frame sampling step -- confirm in VLMCaptioning).

    Returns:
        A fixed hint string when no video was given, otherwise whatever the
        captioner's ``describe_video`` returns.
    """
    if video is None:
        return "Please upload a video."

    # The captioner is looked up here, inside the decorated function.
    captioner = get_vlm_captioner()
    return captioner.describe_video(
        video_path=video,
        frame_interval=frame_interval,
    )
def generate_video_prompt_wrapper(
    concept, style, camera_style, camera_direction,
    pacing, special_effects, custom_elements,
    provider, model, prompt_length
):
    """Forward the UI selections to the shared LLM node and return its result.

    All ten arguments are handed to ``generate_video_prompt`` positionally, in
    the same order as this function's signature.
    """
    forwarded = (
        concept, style, camera_style, camera_direction,
        pacing, special_effects, custom_elements,
        provider, model, prompt_length,
    )
    return get_llm_node().generate_video_prompt(*forwarded)
# Models offered by each LLM provider. This single mapping drives the provider
# dropdown, the initial model dropdown and the provider-change callback, so the
# lists cannot drift apart. The first model of each list is the one selected
# by default for that provider.
PROVIDER_MODELS = {
    "SambaNova": [
        "Meta-Llama-3.1-70B-Instruct",
        "Meta-Llama-3.1-405B-Instruct",
        "Meta-Llama-3.1-8B-Instruct",
    ],
    "Groq": ["llama-3.3-70b-versatile"],
}

# Provider selected when the page first loads.
DEFAULT_PROVIDER = "SambaNova"


def create_video_interface():
    """Build and return the Gradio ``Blocks`` app.

    The app has two tabs:

    * "Video Prompt Generator" -- text/dropdown inputs whose values are sent
      to ``generate_video_prompt_wrapper`` when the generate button is clicked.
    * "Visual Analysis" -- image and video upload panels wired to
      ``describe_image_wrapper`` and ``describe_video_wrapper``.

    NOTE(review): the row/column nesting is reconstructed from the order of
    the context managers; confirm that the generate button and output box
    belong in the third column.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    default_models = PROVIDER_MODELS[DEFAULT_PROVIDER]

    with gr.Blocks(theme='bethecloud/storj_theme') as demo:
        gr.HTML(title)

        with gr.Tab("Video Prompt Generator"):
            with gr.Row():
                # Column 1: concept and overall style.
                with gr.Column(scale=1):
                    input_concept = gr.Textbox(label="Core Concept/Thematic Input", lines=3)
                    style = gr.Dropdown(
                        choices=["Minimalist", "Simple", "Detailed", "Descriptive", "Dynamic",
                                 "Cinematic", "Documentary", "Animation", "Action", "Experimental"],
                        value="Simple",
                        label="Video Style"
                    )
                    custom_elements = gr.Textbox(label="Custom Technical Elements",
                                                 placeholder="e.g., Infrared hybrid, Datamosh transitions")
                    prompt_length = gr.Dropdown(
                        choices=["Short", "Medium", "Long"],
                        value="Medium",
                        label="Prompt Length"
                    )

                # Column 2: camera, pacing and effects options.
                with gr.Column(scale=1):
                    camera_direction = gr.Dropdown(
                        choices=[
                            "None",
                            "Zoom in", "Zoom out", "Pan left", "Pan right",
                            "Tilt up", "Tilt down", "Orbital rotation",
                            "Push in", "Pull out", "Track forward", "Track backward",
                            "Spiral in", "Spiral out", "Arc movement",
                            "Diagonal traverse", "Vertical rise", "Vertical descent"
                        ],
                        value="None",
                        label="Camera Direction"
                    )
                    camera_style = gr.Dropdown(
                        choices=[
                            "None",
                            "Steadicam flow", "Drone aerials", "Handheld urgency", "Crane elegance",
                            "Dolly precision", "VR 360", "Multi-angle rig", "Static tripod",
                            "Gimbal smoothness", "Slider motion", "Jib sweep", "POV immersion",
                            "Time-slice array", "Macro extreme", "Tilt-shift miniature",
                            "Snorricam character", "Whip pan dynamics", "Dutch angle tension",
                            "Underwater housing", "Periscope lens"
                        ],
                        value="None",
                        label="Camera Movement Style"
                    )
                    pacing = gr.Dropdown(
                        choices=[
                            "None",
                            "Slow burn", "Rhythmic pulse", "Frantic energy", "Ebb and flow",
                            "Hypnotic drift", "Time-lapse rush", "Stop-motion staccato",
                            "Gradual build", "Quick cut rhythm", "Long take meditation",
                            "Jump cut energy", "Match cut flow", "Cross-dissolve dreamscape",
                            "Parallel action", "Slow motion impact", "Ramping dynamics",
                            "Montage tempo", "Continuous flow", "Episodic breaks"
                        ],
                        value="None",
                        label="Pacing Rhythm"
                    )
                    special_effects = gr.Dropdown(
                        choices=[
                            "None",
                            "Practical effects", "CGI enhancement", "Analog glitches",
                            "Light painting", "Projection mapping", "Nanosecond exposures",
                            "Double exposure", "Smoke diffusion", "Lens flare artistry",
                            "Particle systems", "Holographic overlay", "Chromatic aberration",
                            "Digital distortion", "Wire removal", "Motion capture",
                            "Miniature integration", "Weather simulation", "Color grading",
                            "Mixed media composite", "Neural style transfer"
                        ],
                        value="None",
                        label="SFX Approach"
                    )

                # Column 3: LLM selection, trigger button and result box.
                with gr.Column(scale=1):
                    provider = gr.Dropdown(
                        choices=list(PROVIDER_MODELS),
                        value=DEFAULT_PROVIDER,
                        label="LLM Provider"
                    )
                    model = gr.Dropdown(
                        choices=list(default_models),
                        value=default_models[0],
                        label="Model"
                    )
                    generate_btn = gr.Button("Generate Video Prompt", variant="primary")
                    output = gr.Textbox(label="Generated Prompt", lines=12, show_copy_button=True)

            def update_models(provider):
                """Return a model dropdown populated for the chosen provider."""
                available = PROVIDER_MODELS[provider]
                return gr.Dropdown(choices=available, value=available[0])

            # Keep the model list in sync with the selected provider.
            provider.change(update_models, inputs=provider, outputs=model)

            # Input order must match generate_video_prompt_wrapper's parameters.
            generate_btn.click(
                generate_video_prompt_wrapper,
                inputs=[input_concept, style, camera_style, camera_direction, pacing, special_effects,
                        custom_elements, provider, model, prompt_length],
                outputs=output
            )

        with gr.Tab("Visual Analysis"):
            with gr.Row():
                with gr.Column():
                    image_input = gr.Image(label="Upload Image", type="filepath")
                    image_question = gr.Textbox(
                        label="Question (optional)",
                        placeholder="What is in this image?"
                    )
                    analyze_image_btn = gr.Button("Analyze Image")
                    image_output = gr.Textbox(label="Analysis Result", lines=5)

                with gr.Column():
                    video_input = gr.Video(label="Upload Video")
                    analyze_video_btn = gr.Button("Analyze Video")
                    video_output = gr.Textbox(label="Video Analysis", lines=10)

            # The decorated wrapper functions are used directly as handlers.
            analyze_image_btn.click(
                describe_image_wrapper,
                inputs=[image_input, image_question],
                outputs=image_output
            )
            # Only the video is supplied, so the wrapper's default
            # frame_interval applies.
            analyze_video_btn.click(
                describe_video_wrapper,
                inputs=video_input,
                outputs=video_output
            )

    return demo
if __name__ == "__main__":
    # Build the UI and serve it with default launch settings. share=True is
    # deliberately not passed: it should not be used on Hugging Face Spaces.
    demo = create_video_interface()
    demo.launch()