import spaces  # import spaces before torch: ZeroGPU patches CUDA initialization
import torch
import gradio as gr

# Create Gradio UI without loading models first
title = """<h1 align="center">AI Video Prompt Generator</h1>
<p align="center">Generate creative video prompts with technical specifications</p>
<p align="center">Prompts work with Kling, MiniMax, Hunyuan, Haiper, CogVideoX, Luma, LTX, Runway, and PixVerse.</p>"""

# Import these at global scope but don't instantiate yet
from vlm_captions import VLMCaptioning
from llm_inference_video import VideoLLMInferenceNode

# Global singleton instances - we'll initialize them only when needed
vlm_captioner = None
llm_node = None

# Initialize only once on first use
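# Lazy construction matters on ZeroGPU: no GPU is attached at import time,
# so heavyweight model loading is deferred until the first request runs
# inside a @spaces.GPU-decorated call.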
def get_vlm_captioner():
    global vlm_captioner
    if vlm_captioner is None:
        print("Initializing Video Prompt Generator...")
        vlm_captioner = VLMCaptioning()
        print("Video Prompt Generator initialized successfully!")
    return vlm_captioner

def get_llm_node():
    global llm_node
    if llm_node is None:
        llm_node = VideoLLMInferenceNode()
    return llm_node

# Wrapper functions that avoid passing the model between processes
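# On ZeroGPU, @spaces.GPU functions execute in a separate GPU-attached process,
# and loaded models don't survive that process boundary, so each wrapper
# fetches (or builds) its singleton inside the decorated call rather than
# receiving it as an argument.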
@spaces.GPU()
def describe_image_wrapper(image, question="Describe this image in detail."):
    """GPU-decorated function for image description"""
    if image is None:
        return "Please upload an image."
    
    if not question or not question.strip():
        question = "Describe this image in detail."
    
    # Get the captioner inside this GPU-decorated function
    vlm = get_vlm_captioner()
    return vlm.describe_image(image=image, question=question)

@spaces.GPU()
def describe_video_wrapper(video, frame_interval=30):
    """GPU-decorated function for video description"""
    if video is None:
        return "Please upload a video."
    
    # Get the captioner inside this GPU-decorated function
    vlm = get_vlm_captioner()
    return vlm.describe_video(video_path=video, frame_interval=frame_interval)

def generate_video_prompt_wrapper(
    concept, style, camera_style, camera_direction, 
    pacing, special_effects, custom_elements, 
    provider, model, prompt_length
):
    """Wrapper for LLM prompt generation"""
    node = get_llm_node()
    return node.generate_video_prompt(
        concept, style, camera_style, camera_direction,
        pacing, special_effects, custom_elements,
        provider, model, prompt_length
    )

def create_video_interface():
    with gr.Blocks(theme='bethecloud/storj_theme') as demo:
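        # The 'bethecloud/storj_theme' theme is fetched from the Hugging Face Hub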
        gr.HTML(title)
        
        with gr.Tab("Video Prompt Generator"):
            with gr.Row():
                with gr.Column(scale=1):
                    input_concept = gr.Textbox(label="Core Concept/Thematic Input", lines=3)
                    style = gr.Dropdown(
                        choices=["Minimalist", "Simple", "Detailed", "Descriptive", "Dynamic", 
                                "Cinematic", "Documentary", "Animation", "Action", "Experimental"],
                        value="Simple",
                        label="Video Style"
                    )
                    custom_elements = gr.Textbox(label="Custom Technical Elements", 
                                               placeholder="e.g., Infrared hybrid, Datamosh transitions")
                    prompt_length = gr.Dropdown(
                        choices=["Short", "Medium", "Long"],
                        value="Medium",
                        label="Prompt Length"
                    ) 
                    
                with gr.Column(scale=1):
                    camera_direction = gr.Dropdown(
                        choices=[
                            "None",
                            "Zoom in", "Zoom out", "Pan left", "Pan right",
                            "Tilt up", "Tilt down", "Orbital rotation",
                            "Push in", "Pull out", "Track forward", "Track backward",
                            "Spiral in", "Spiral out", "Arc movement",
                            "Diagonal traverse", "Vertical rise", "Vertical descent"
                        ],
                        value="None",
                        label="Camera Direction"
                    )

                    camera_style = gr.Dropdown(
                        choices=[
                            "None",
                            "Steadicam flow", "Drone aerials", "Handheld urgency", "Crane elegance",
                            "Dolly precision", "VR 360", "Multi-angle rig", "Static tripod",
                            "Gimbal smoothness", "Slider motion", "Jib sweep", "POV immersion",
                            "Time-slice array", "Macro extreme", "Tilt-shift miniature",
                            "Snorricam character", "Whip pan dynamics", "Dutch angle tension",
                            "Underwater housing", "Periscope lens"
                        ],
                        value="None",
                        label="Camera Movement Style"
                    )

                    pacing = gr.Dropdown(
                        choices=[
                            "None",
                            "Slow burn", "Rhythmic pulse", "Frantic energy", "Ebb and flow",
                            "Hypnotic drift", "Time-lapse rush", "Stop-motion staccato",
                            "Gradual build", "Quick cut rhythm", "Long take meditation",
                            "Jump cut energy", "Match cut flow", "Cross-dissolve dreamscape",
                            "Parallel action", "Slow motion impact", "Ramping dynamics",
                            "Montage tempo", "Continuous flow", "Episodic breaks"
                        ],
                        value="None",
                        label="Pacing Rhythm"
                    )
                    special_effects = gr.Dropdown(
                        choices=[
                            "None",
                            "Practical effects", "CGI enhancement", "Analog glitches",
                            "Light painting", "Projection mapping", "Nanosecond exposures",
                            "Double exposure", "Smoke diffusion", "Lens flare artistry",
                            "Particle systems", "Holographic overlay", "Chromatic aberration",
                            "Digital distortion", "Wire removal", "Motion capture",
                            "Miniature integration", "Weather simulation", "Color grading",
                            "Mixed media composite", "Neural style transfer"
                        ],
                        value="None",
                        label="SFX Approach"
                    )
                    
                with gr.Column(scale=1):
                    provider = gr.Dropdown(
                        choices=["SambaNova", "Groq"],
                        value="SambaNova",
                        label="LLM Provider"
                    )
                    model = gr.Dropdown(
                        choices=[
                            "Meta-Llama-3.1-70B-Instruct",
                            "Meta-Llama-3.1-405B-Instruct",
                            "Meta-Llama-3.1-8B-Instruct"
                        ],
                        value="Meta-Llama-3.1-70B-Instruct",
                        label="Model"
                    )
                    
                    generate_btn = gr.Button("Generate Video Prompt", variant="primary")
                    output = gr.Textbox(label="Generated Prompt", lines=12, show_copy_button=True)

            def update_models(provider):
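                # Returning a component instance from an event handler updates
                # the existing Dropdown in place (Gradio 4.x update pattern).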
                models = {
                    "Groq": ["llama-3.3-70b-versatile"],
                    "SambaNova": [
                        "Meta-Llama-3.1-70B-Instruct",
                        "Meta-Llama-3.1-405B-Instruct",
                        "Meta-Llama-3.1-8B-Instruct"
                    ]
                }
                return gr.Dropdown(choices=models[provider], value=models[provider][0])

            provider.change(update_models, inputs=provider, outputs=model)

            generate_btn.click(
                generate_video_prompt_wrapper,
                inputs=[input_concept, style, camera_style, camera_direction, pacing, special_effects, 
                       custom_elements, provider, model, prompt_length],
                outputs=output
            )

        with gr.Tab("Visual Analysis"):
            with gr.Row():
                with gr.Column():
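                    # type="filepath" hands the handler a temp-file path,
                    # which is presumably what VLMCaptioning.describe_image expects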
                    image_input = gr.Image(label="Upload Image", type="filepath")
                    image_question = gr.Textbox(
                        label="Question (optional)", 
                        placeholder="What is in this image?"
                    )

                    analyze_image_btn = gr.Button("Analyze Image")
                    image_output = gr.Textbox(label="Analysis Result", lines=5)

                with gr.Column():
                    video_input = gr.Video(label="Upload Video")
                    analyze_video_btn = gr.Button("Analyze Video")
                    video_output = gr.Textbox(label="Video Analysis", lines=10)

            # Use GPU-decorated wrapper functions directly
            analyze_image_btn.click(
                describe_image_wrapper,
                inputs=[image_input, image_question],
                outputs=image_output
            )

            analyze_video_btn.click(
                describe_video_wrapper,
                inputs=video_input,
                outputs=video_output
            )

    return demo

if __name__ == "__main__":
    demo = create_video_interface()
    # Don't use share=True on Hugging Face Spaces
    demo.launch()