gokaygokay committed
Commit e25da44 · 1 Parent(s): 2022eac
Files changed (4)
  1. app.py +45 -214
  2. llm_inference_video.py +0 -351
  3. requirements.txt +8 -17
  4. vlm_captions.py +0 -129
app.py CHANGED
@@ -1,222 +1,53 @@
 
  import torch
  import gradio as gr
- import spaces

- # Create Gradio UI without loading models first
- title = """<h1 align="center">AI Video Prompt Generator</h1>
- <p align="center">Generate creative video prompts with technical specifications</p>
- <p align="center">You can use prompts with Kling, MiniMax, Hunyuan, Haiper, CogVideoX, Luma, LTX, Runway, PixVerse. </p>"""
-
- # Import these at global scope but don't instantiate yet
- from vlm_captions import VLMCaptioning
- from llm_inference_video import VideoLLMInferenceNode
-
- # Global singleton instances - we'll initialize them only when needed
- vlm_captioner = None
- llm_node = None
-
- # Initialize only once on first use
- def get_vlm_captioner():
-     global vlm_captioner
-     if vlm_captioner is None:
-         print("Initializing Video Prompt Generator...")
-         vlm_captioner = VLMCaptioning()
-         print("Video Prompt Generator initialized successfully!")
-     return vlm_captioner
-
- def get_llm_node():
-     global llm_node
-     if llm_node is None:
-         llm_node = VideoLLMInferenceNode()
-     return llm_node
-
- # Wrapper functions that avoid passing the model between processes
- @spaces.GPU()
- def describe_image_wrapper(image, question="Describe this image in detail."):
-     """GPU-decorated function for image description"""
-     if image is None:
-         return "Please upload an image."
-
-     if not question or question.strip() == "":
-         question = "Describe this image in detail."
-
-     # Get the captioner inside this GPU-decorated function
-     vlm = get_vlm_captioner()
-     return vlm.describe_image(image=image, question=question)
-
- @spaces.GPU()
- def describe_video_wrapper(video, frame_interval=30):
-     """GPU-decorated function for video description"""
-     if video is None:
-         return "Please upload a video."
-
-     # Get the captioner inside this GPU-decorated function
-     vlm = get_vlm_captioner()
-     return vlm.describe_video(video_path=video, frame_interval=frame_interval)
-
- def generate_video_prompt_wrapper(
-     concept, style, camera_style, camera_direction,
-     pacing, special_effects, custom_elements,
-     provider, model, prompt_length
- ):
-     """Wrapper for LLM prompt generation"""
-     node = get_llm_node()
-     return node.generate_video_prompt(
-         concept, style, camera_style, camera_direction,
-         pacing, special_effects, custom_elements,
-         provider, model, prompt_length
-     )
-
- def create_video_interface():
-     with gr.Blocks(theme='bethecloud/storj_theme') as demo:
-         gr.HTML(title)
-
-         with gr.Tab("Video Prompt Generator"):
-             with gr.Row():
-                 with gr.Column(scale=1):
-                     input_concept = gr.Textbox(label="Core Concept/Thematic Input", lines=3)
-                     style = gr.Dropdown(
-                         choices=["Minimalist", "Simple", "Detailed", "Descriptive", "Dynamic",
-                                  "Cinematic", "Documentary", "Animation", "Action", "Experimental"],
-                         value="Simple",
-                         label="Video Style"
-                     )
-                     custom_elements = gr.Textbox(label="Custom Technical Elements",
-                                                  placeholder="e.g., Infrared hybrid, Datamosh transitions")
-                     prompt_length = gr.Dropdown(
-                         choices=["Short", "Medium", "Long"],
-                         value="Medium",
-                         label="Prompt Length"
-                     )
-
-                 with gr.Column(scale=1):
-                     camera_direction = gr.Dropdown(
-                         choices=[
-                             "None",
-                             "Zoom in", "Zoom out", "Pan left", "Pan right",
-                             "Tilt up", "Tilt down", "Orbital rotation",
-                             "Push in", "Pull out", "Track forward", "Track backward",
-                             "Spiral in", "Spiral out", "Arc movement",
-                             "Diagonal traverse", "Vertical rise", "Vertical descent"
-                         ],
-                         value="None",
-                         label="Camera Direction"
-                     )
-
-                     camera_style = gr.Dropdown(
-                         choices=[
-                             "None",
-                             "Steadicam flow", "Drone aerials", "Handheld urgency", "Crane elegance",
-                             "Dolly precision", "VR 360", "Multi-angle rig", "Static tripod",
-                             "Gimbal smoothness", "Slider motion", "Jib sweep", "POV immersion",
-                             "Time-slice array", "Macro extreme", "Tilt-shift miniature",
-                             "Snorricam character", "Whip pan dynamics", "Dutch angle tension",
-                             "Underwater housing", "Periscope lens"
-                         ],
-                         value="None",
-                         label="Camera Movement Style"
-                     )
-
-                     pacing = gr.Dropdown(
-                         choices=[
-                             "None",
-                             "Slow burn", "Rhythmic pulse", "Frantic energy", "Ebb and flow",
-                             "Hypnotic drift", "Time-lapse rush", "Stop-motion staccato",
-                             "Gradual build", "Quick cut rhythm", "Long take meditation",
-                             "Jump cut energy", "Match cut flow", "Cross-dissolve dreamscape",
-                             "Parallel action", "Slow motion impact", "Ramping dynamics",
-                             "Montage tempo", "Continuous flow", "Episodic breaks"
-                         ],
-                         value="None",
-                         label="Pacing Rhythm"
-                     )
-                     special_effects = gr.Dropdown(
-                         choices=[
-                             "None",
-                             "Practical effects", "CGI enhancement", "Analog glitches",
-                             "Light painting", "Projection mapping", "Nanosecond exposures",
-                             "Double exposure", "Smoke diffusion", "Lens flare artistry",
-                             "Particle systems", "Holographic overlay", "Chromatic aberration",
-                             "Digital distortion", "Wire removal", "Motion capture",
-                             "Miniature integration", "Weather simulation", "Color grading",
-                             "Mixed media composite", "Neural style transfer"
-                         ],
-                         value="None",
-                         label="SFX Approach"
-                     )
-
-                 with gr.Column(scale=1):
-                     provider = gr.Dropdown(
-                         choices=["SambaNova", "Groq"],
-                         value="SambaNova",
-                         label="LLM Provider"
-                     )
-                     model = gr.Dropdown(
-                         choices=[
-                             "Meta-Llama-3.1-70B-Instruct",
-                             "Meta-Llama-3.1-405B-Instruct",
-                             "Meta-Llama-3.1-8B-Instruct"
-                         ],
-                         value="Meta-Llama-3.1-70B-Instruct",
-                         label="Model"
-                     )
-
-             generate_btn = gr.Button("Generate Video Prompt", variant="primary")
-             output = gr.Textbox(label="Generated Prompt", lines=12, show_copy_button=True)
-
-             def update_models(provider):
-                 models = {
-                     "Groq": ["llama-3.3-70b-versatile"],
-                     "SambaNova": [
-                         "Meta-Llama-3.1-70B-Instruct",
-                         "Meta-Llama-3.1-405B-Instruct",
-                         "Meta-Llama-3.1-8B-Instruct"
-                     ]
-                 }
-                 return gr.Dropdown(choices=models[provider], value=models[provider][0])
-
-             provider.change(update_models, inputs=provider, outputs=model)
-
-             generate_btn.click(
-                 generate_video_prompt_wrapper,
-                 inputs=[input_concept, style, camera_style, camera_direction, pacing, special_effects,
-                         custom_elements, provider, model, prompt_length],
-                 outputs=output
-             )
-
-         with gr.Tab("Visual Analysis"):
-             with gr.Row():
-                 with gr.Column():
-                     image_input = gr.Image(label="Upload Image", type="filepath")
-                     image_question = gr.Textbox(
-                         label="Question (optional)",
-                         placeholder="What is in this image?"
-                     )
-
-                     analyze_image_btn = gr.Button("Analyze Image")
-                     image_output = gr.Textbox(label="Analysis Result", lines=5)
-
-                 with gr.Column():
-                     video_input = gr.Video(label="Upload Video")
-                     analyze_video_btn = gr.Button("Analyze Video")
-                     video_output = gr.Textbox(label="Video Analysis", lines=10)
-
-             # Use GPU-decorated wrapper functions directly
-             analyze_image_btn.click(
-                 describe_image_wrapper,
-                 inputs=[image_input, image_question],
-                 outputs=image_output
-             )
-
-             analyze_video_btn.click(
-                 describe_video_wrapper,
-                 inputs=video_input,
-                 outputs=video_output
-             )
-
-     return demo

  if __name__ == "__main__":
-     demo = create_video_interface()
-     # Don't use share=True on Hugging Face Spaces
-     demo.launch()
 
+ import os
  import torch
  import gradio as gr
+ from diffusers import FluxTransformer2DModel, FluxPipeline, BitsAndBytesConfig
+ from transformers import T5EncoderModel, BitsAndBytesConfig as BitsAndBytesConfigTF

+ def generate_image(prompt, negative_prompt="", num_inference_steps=30, guidance_scale=7.5):
+     # Initialize Flux pipeline
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     dtype = torch.bfloat16
+     single_file_base_model = "camenduru/FLUX.1-dev-diffusers"
+     file_url = "https://huggingface.co/lodestones/Chroma/resolve/main/chroma-unlocked-v31.safetensors"
+
+     quantization_config_tf = BitsAndBytesConfigTF(load_in_8bit=True, bnb_8bit_compute_dtype=torch.bfloat16)
+     text_encoder_2 = T5EncoderModel.from_pretrained(single_file_base_model, subfolder="text_encoder_2", torch_dtype=dtype, config=single_file_base_model, quantization_config=quantization_config_tf)
+
+     quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16)
+     transformer = FluxTransformer2DModel.from_single_file(file_url, subfolder="transformer", torch_dtype=dtype, config=single_file_base_model, quantization_config=quantization_config)
+
+     flux_pipeline = FluxPipeline.from_pretrained(single_file_base_model, transformer=transformer, text_encoder_2=text_encoder_2, torch_dtype=dtype)
+     flux_pipeline.to(device)
+
+     # Generate image
+     image = flux_pipeline(
+         prompt=prompt,
+         negative_prompt=negative_prompt,
+         num_inference_steps=num_inference_steps,
+         guidance_scale=guidance_scale
+     ).images[0]
+
+     return image
+
+ # Create Gradio interface
+ iface = gr.Interface(
+     fn=generate_image,
+     inputs=[
+         gr.Textbox(label="Prompt", placeholder="Enter your prompt here..."),
+         gr.Textbox(label="Negative Prompt", placeholder="Enter negative prompt here...", value=""),
+         gr.Slider(minimum=1, maximum=100, value=30, step=1, label="Number of Inference Steps"),
+         gr.Slider(minimum=1.0, maximum=20.0, value=7.5, step=0.1, label="Guidance Scale")
+     ],
+     outputs=gr.Image(label="Generated Image"),
+     title="Chroma Image Generator",
+     description="Generate images using the Chroma model with FLUX pipeline",
+     examples=[
+         ["A beautiful sunset over mountains, photorealistic, 8k", "blurry, low quality, distorted", 30, 7.5],
+         ["A futuristic cityscape at night, neon lights, cyberpunk style", "ugly, deformed, low resolution", 30, 7.5]
+     ]
+ )

  if __name__ == "__main__":
+     iface.launch()
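
Note: as committed, generate_image builds the quantized text encoder, transformer, and FluxPipeline on every Gradio request. Below is a minimal sketch of one alternative (loading once at module scope so each request reuses the cached pipeline); it only rearranges calls already present in the diff above and is not part of the commit.

    import torch
    from diffusers import FluxTransformer2DModel, FluxPipeline, BitsAndBytesConfig
    from transformers import T5EncoderModel, BitsAndBytesConfig as BitsAndBytesConfigTF

    dtype = torch.bfloat16
    base_model = "camenduru/FLUX.1-dev-diffusers"
    chroma_url = "https://huggingface.co/lodestones/Chroma/resolve/main/chroma-unlocked-v31.safetensors"

    # Load once at import time: 8-bit T5 text encoder, NF4-quantized Chroma transformer.
    text_encoder_2 = T5EncoderModel.from_pretrained(
        base_model, subfolder="text_encoder_2", torch_dtype=dtype,
        quantization_config=BitsAndBytesConfigTF(load_in_8bit=True, bnb_8bit_compute_dtype=dtype),
    )
    transformer = FluxTransformer2DModel.from_single_file(
        chroma_url, config=base_model, subfolder="transformer", torch_dtype=dtype,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True, bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=dtype,
        ),
    )
    pipe = FluxPipeline.from_pretrained(
        base_model, transformer=transformer, text_encoder_2=text_encoder_2, torch_dtype=dtype,
    )
    pipe.to("cuda" if torch.cuda.is_available() else "cpu")

    def generate_image(prompt, negative_prompt="", num_inference_steps=30, guidance_scale=7.5):
        # Only inference runs per request; the heavy objects above are reused.
        return pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
        ).images[0]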
 
 
llm_inference_video.py DELETED
@@ -1,351 +0,0 @@
- import os
- import time
- import requests
- from typing import Optional, Dict, Any, List
- import json
- import tempfile
- from PIL import Image
- from groq import Groq
- from openai import OpenAI
- import spaces
-
- class VideoLLMInferenceNode:
-     def __init__(self):
-         """
-         Initialize the VideoLLMInferenceNode without VLM captioning dependency
-         """
-         self.sambanova_api_key = os.environ.get("SAMBANOVA_API_KEY", "")
-         self.groq_api_key = os.environ.get("GROQ_API_KEY", "")
-
-         # Initialize API clients if keys are available
-         if self.groq_api_key:
-             self.groq_client = Groq(api_key=self.groq_api_key)
-         else:
-             self.groq_client = None
-
-         if self.sambanova_api_key:
-             self.sambanova_client = OpenAI(
-                 api_key=self.sambanova_api_key,
-                 base_url="https://api.sambanova.ai/v1",
-             )
-         else:
-             self.sambanova_client = None
-
-     @spaces.GPU()
-     def analyze_image(self, image_path: str, question: Optional[str] = None) -> str:
-         """
-         Analyze an image using VLM model directly
-
-         Args:
-             image_path: Path to the image file
-             question: Optional question to ask about the image
-
-         Returns:
-             str: Analysis result
-         """
-         if not image_path:
-             return "Please upload an image."
-
-         if not question or question.strip() == "":
-             question = "Describe this image in detail."
-
-         try:
-             # Import and use VLMCaptioning within this GPU-scoped function
-             from app import get_vlm_captioner
-             vlm = get_vlm_captioner()
-             return vlm.describe_image(image_path, question)
-         except Exception as e:
-             return f"Error analyzing image: {str(e)}"
-
-     @spaces.GPU()
-     def analyze_video(self, video_path: str) -> str:
-         """
-         Analyze a video using VLM model directly
-
-         Args:
-             video_path: Path to the video file
-
-         Returns:
-             str: Analysis result
-         """
-         if not video_path:
-             return "Please upload a video."
-
-         try:
-             # Import and use VLMCaptioning within this GPU-scoped function
-             from app import get_vlm_captioner
-             vlm = get_vlm_captioner()
-             return vlm.describe_video(video_path)
-         except Exception as e:
-             return f"Error analyzing video: {str(e)}"
-
-     def generate_video_prompt(
-         self,
-         concept: str,
-         style: str = "Simple",
-         camera_style: str = "None",
-         camera_direction: str = "None",
-         pacing: str = "None",
-         special_effects: str = "None",
-         custom_elements: str = "",
-         provider: str = "SambaNova",
-         model: str = "Meta-Llama-3.1-70B-Instruct",
-         prompt_length: str = "Medium",
-         image_path: str = "",
-         video_path: str = ""
-     ) -> str:
-         """
-         Generate a video prompt using the specified LLM provider
-
-         Args:
-             concept: Core concept for the video
-             style: Video style
-             camera_style: Camera style
-             camera_direction: Camera direction
-             pacing: Pacing rhythm
-             special_effects: Special effects approach
-             custom_elements: Custom technical elements
-             provider: LLM provider (SambaNova or Groq)
-             model: Model name
-             prompt_length: Desired prompt length
-             image_path: Optional path to an image for VLM description
-             video_path: Optional path to a video for VLM description
-
-         Returns:
-             str: Generated video prompt
-         """
-         if not concept:
-             return "Please enter a concept for the video."
-
-         try:
-             # Get VLM descriptions if image or video paths are provided
-             image_description = ""
-             video_description = ""
-
-             if image_path:
-                 try:
-                     image_description = self.analyze_image(image_path, "Describe this image in detail for a video creator.")
-                     print(f"Generated image description: {image_description}")
-                 except Exception as e:
-                     print(f"Error generating image description: {str(e)}")
-
-             if video_path:
-                 try:
-                     video_description = self.analyze_video(video_path)
-                     print(f"Generated video description: {video_description}")
-                 except Exception as e:
-                     print(f"Error generating video description: {str(e)}")
-
-             # Helper function to format optional elements
-             def format_element(element, element_type):
-                 if element == "None" or not element:
-                     return ""
-
-                 element_prefixes = {
-                     "camera": "utilizing",
-                     "direction": "with",
-                     "pacing": "with",
-                     "effects": "incorporating"
-                 }
-
-                 return f" {element_prefixes.get(element_type, '')} {element}"
-
-             # Format camera movement combination
-             camera_movement = ""
-             if camera_style != "None" and camera_direction != "None":
-                 camera_movement = f"{camera_style} {camera_direction}"
-             elif camera_style != "None":
-                 camera_movement = camera_style
-             elif camera_direction != "None":
-                 camera_movement = camera_direction
-
-             # Video prompt templates
-             default_style = "simple"  # Changed from "cinematic" to "simple" as default
-
-             prompt_templates = {
-                 "minimalist": f"""Create an elegantly sparse video description focusing on {concept}.
- {format_element(camera_movement, 'camera')}
- {format_element(pacing, 'pacing')}
- {format_element(special_effects, 'effects')}
- {' with ' + custom_elements if custom_elements else ''}.""",
-
-                 "dynamic": f"""Craft an energetic, fast-paced paragraph showcasing {concept} in constant motion. Utilize bold {camera_style} movements and {pacing} rhythm to create momentum. Layer {special_effects} effects and {custom_elements if custom_elements else 'powerful visual elements'} to maintain high energy throughout.""",
-
-                 "simple": f"""Create a straightforward, easy-to-understand paragraph describing a video about {concept}. Use {camera_style} camera work and {pacing} pacing. Keep the visuals clear and uncomplicated, incorporating {special_effects} effects and {custom_elements if custom_elements else 'basic visual elements'} in an accessible way.""",
-
-                 "detailed": f"""Construct a meticulous, technically precise paragraph outlining a video about {concept}. Incorporate specific details about {camera_style} cinematography, {pacing} timing, and {special_effects} effects. Include {custom_elements if custom_elements else 'precise technical elements'} while maintaining clarity and depth.""",
-
-                 "descriptive": f"""Write a richly descriptive paragraph for a video exploring {concept}. Paint a vivid picture using sensory details, incorporating {camera_style} movement, {pacing} flow, and {special_effects} effects. Emphasize texture, color, and atmosphere, enhanced by {custom_elements if custom_elements else 'evocative visual elements'}.""",
-
-                 "cinematic": f"""Create a single, detailed paragraph describing a cinematic video that captures {concept}. Focus on creating a cohesive narrative that incorporates {style} visual aesthetics, {camera_style} camera work, {pacing} pacing, and {special_effects} effects. Include atmospheric elements like {custom_elements if custom_elements else 'mood lighting and environmental details'} to enhance the storytelling. Describe the visual journey without technical timestamps or shot lists.""",
-
-                 "documentary": f"""Write a comprehensive paragraph for a documentary-style video exploring {concept}. Blend observational footage with {camera_style} cinematography, incorporating {pacing} editorial rhythm and {special_effects} visual treatments. Focus on creating an immersive narrative that educates and engages, enhanced by {custom_elements if custom_elements else 'authentic moments and natural lighting'}.""",
-
-                 "animation": f"""Compose a vivid paragraph describing a {style} animated video showcasing {concept}. Detail the unique visual style, character movements, and world-building elements, incorporating {camera_style} perspectives and {pacing} story flow. Include {special_effects} animation effects and {custom_elements if custom_elements else 'signature artistic elements'} to create a memorable visual experience.""",
-
-                 "action": f"""Craft an energetic paragraph describing an action sequence centered on {concept}. Emphasize the dynamic flow of action using {camera_style} cinematography, {pacing} rhythm, and {special_effects} visual effects. Incorporate {style} stylistic choices and {custom_elements if custom_elements else 'impactful moments'} to create an adrenaline-pumping experience.""",
-
-                 "experimental": f"""Create an avant-garde paragraph describing an experimental video exploring {concept}. Embrace unconventional storytelling through {style} aesthetics, {camera_style} techniques, and {pacing} temporal flow. Incorporate {special_effects} digital manipulations and {custom_elements if custom_elements else 'abstract visual metaphors'} to challenge traditional narrative structures."""
-             }
-
-             # Get the template with a more neutral default
-             selected_style = style.lower()
-             if selected_style not in prompt_templates:
-                 print(f"Warning: Style '{style}' not found, using '{default_style}' template")
-                 selected_style = default_style
-
-             base_prompt = prompt_templates[selected_style]
-
-             # Configure length requirements
-             length_config = {
-                 "Short": {
-                     "guidance": "Create exactly very short, ONE impactful sentence that captures the essence of the video. Be concise but descriptive.",
-                     "structure": "Combine all elements into a single, powerful sentence."
-                 },
-                 "Medium": {
-                     "guidance": "Create 2-3 flowing sentences that paint a picture of the video.",
-                     "structure": "First sentence should set the scene, followed by 1-2 sentences developing the concept."
-                 },
-                 "Long": {
-                     "guidance": "Create 4-5 detailed sentences that thoroughly describe the video.",
-                     "structure": "Begin with the setting, develop the action/movement, and conclude with impact."
-                 }
-             }
-
-             config = length_config[prompt_length]
-
-             system_message = f"""You are a visionary video director and creative storyteller. {config['guidance']}
-
- Structure: {config['structure']}
-
- Focus on these elements while maintaining the specified sentence count:
- 1. Visual atmosphere and mood
- 2. Camera movement and cinematography
- 3. Narrative flow
- 4. Style and aesthetic choices
- 5. Key moments
- 6. Emotional impact
- {'' if not image_description and not video_description else '7. Elements from the provided image/video descriptions'}
-
- {'' if not image_description and not video_description else 'If image or video descriptions are provided, incorporate their key visual elements and content into your description to ensure accuracy and relevance.'}
-
- IMPORTANT REQUIREMENTS:
- - Deliver exactly the specified number of sentences
- - Short: ONE sentence
- - Medium: TWO to THREE sentences
- - Long: FOUR to FIVE sentences
- - If camera movements are specified, you MUST incorporate them into the description
- - Keep everything in a single paragraph format
- - Avoid technical specifications or shot lists
- - Avoid talking about 'video' or 'videos'. Do not start with 'The video opens with...' or 'The video starts with...' and do not include 'in this video' or 'focus of this video'. kind of terms"""
-
-             # Format the user prompt with style guidance and camera movement
-             user_message = f"""Style Guide: {selected_style.capitalize()} Style
- {prompt_templates[selected_style]}
-
- Camera Movement: {camera_movement if camera_movement else 'No specific camera movement'}
- Core Concept: {concept}
- {f'Reference Image Description: {image_description}' if image_description else ''}
- {f'Reference Video Description: {video_description}' if video_description else ''}
-
- Please create a {prompt_length.lower()}-length description incorporating these elements into a cohesive narrative.
- {'' if not image_description and not video_description else 'Use the provided image/video descriptions as reference to inform your prompt creation.'}
- Avoid talking about 'video' or 'videos'. Do not start with 'The video opens with...' or 'The video starts with...' and do not include 'in this video' or 'focus of this video'. kind of terms. Do not say "Here is your video prompt" or "Here is your video description" or anything like that. Just give the prompt."""
-
-             # Call the appropriate API based on provider
-             if provider == "SambaNova":
-                 if self.sambanova_client:
-                     return self._call_sambanova_client(system_message, user_message, model)
-                 else:
-                     return self._call_sambanova_api(system_message, user_message, model)
-             elif provider == "Groq":
-                 if self.groq_client:
-                     return self._call_groq_client(system_message, user_message, model)
-                 else:
-                     return self._call_groq_api(system_message, user_message, model)
-             else:
-                 return "Unsupported provider. Please select SambaNova or Groq."
-         except Exception as e:
-             return f"Error generating prompt: {str(e)}"
-
-     def _call_sambanova_client(self, system_message: str, user_message: str, model: str) -> str:
-         """Call the SambaNova API using the client library"""
-         try:
-             chat_completion = self.sambanova_client.chat.completions.create(
-                 model=model,
-                 messages=[
-                     {"role": "system", "content": system_message},
-                     {"role": "user", "content": user_message}
-                 ]
-             )
-             return chat_completion.choices[0].message.content
-         except Exception as e:
-             return f"Error from SambaNova API: {str(e)}"
-
-     def _call_sambanova_api(self, system_message: str, user_message: str, model: str) -> str:
-         """Call the SambaNova API using direct HTTP requests"""
-         if not self.sambanova_api_key:
-             return "SambaNova API key not configured. Please set the SAMBANOVA_API_KEY environment variable."
-
-         api_url = "https://api.sambanova.ai/api/v1/chat/completions"
-         headers = {
-             "Content-Type": "application/json",
-             "Authorization": f"Bearer {self.sambanova_api_key}"
-         }
-
-         payload = {
-             "model": model,
-             "messages": [
-                 {"role": "system", "content": system_message},
-                 {"role": "user", "content": user_message}
-             ]
-         }
-
-         response = requests.post(api_url, headers=headers, json=payload)
-
-         if response.status_code == 200:
-             result = response.json()
-             return result.get("choices", [{}])[0].get("message", {}).get("content", "No content returned")
-         else:
-             return f"Error from SambaNova API: {response.status_code} - {response.text}"
-
-     def _call_groq_client(self, system_message: str, user_message: str, model: str) -> str:
-         """Call the Groq API using the client library"""
-         try:
-             chat_completion = self.groq_client.chat.completions.create(
-                 model=model,
-                 messages=[
-                     {"role": "system", "content": system_message},
-                     {"role": "user", "content": user_message}
-                 ]
-             )
-             return chat_completion.choices[0].message.content
-         except Exception as e:
-             return f"Error from Groq API: {str(e)}"
-
-     def _call_groq_api(self, system_message: str, user_message: str, model: str) -> str:
-         """Call the Groq API using direct HTTP requests"""
-         if not self.groq_api_key:
-             return "Groq API key not configured. Please set the GROQ_API_KEY environment variable."
-
-         api_url = "https://api.groq.com/openai/v1/chat/completions"
-         headers = {
-             "Content-Type": "application/json",
-             "Authorization": f"Bearer {self.groq_api_key}"
-         }
-
-         payload = {
-             "model": model,
-             "messages": [
-                 {"role": "system", "content": system_message},
-                 {"role": "user", "content": user_message}
-             ]
-         }
-
-         response = requests.post(api_url, headers=headers, json=payload)
-
-         if response.status_code == 200:
-             result = response.json()
-             return result.get("choices", [{}])[0].get("message", {}).get("content", "No content returned")
-         else:
-             return f"Error from Groq API: {response.status_code} - {response.text}"
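
For reference, this deleted VideoLLMInferenceNode is what the old app.py's generate_video_prompt_wrapper called into. A hypothetical stand-alone invocation, reconstructed from the signature above (the concept string and option values are placeholders drawn from the old dropdowns, and SAMBANOVA_API_KEY or GROQ_API_KEY must be set), would have looked like this:

    from llm_inference_video import VideoLLMInferenceNode

    node = VideoLLMInferenceNode()  # reads SAMBANOVA_API_KEY / GROQ_API_KEY from the environment
    prompt = node.generate_video_prompt(
        concept="a lighthouse in a storm",   # placeholder concept
        style="Cinematic",
        camera_style="Drone aerials",
        camera_direction="Orbital rotation",
        pacing="Slow burn",
        special_effects="Practical effects",
        custom_elements="volumetric fog",    # placeholder custom element
        provider="SambaNova",
        model="Meta-Llama-3.1-70B-Instruct",
        prompt_length="Medium",
    )
    print(prompt)
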
requirements.txt CHANGED
@@ -1,17 +1,8 @@
- openai
- groq
- numpy==1.26.4
- Pillow==10.1.0
- torch==2.1.2
- torchaudio==2.1.2
- torchvision==0.16.2
- transformers==4.44.2
- sentencepiece==0.1.99
- https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.2/flash_attn-2.6.2+cu123torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
- decord
- librosa==0.9.0
- soundfile==0.12.1
- vector-quantize-pytorch==1.18.5
- vocos==0.1.0
- moviepy
- gradio
+ accelerate
+ git+https://github.com/huggingface/diffusers.git
+ torch
+ gradio
+ transformers
+ xformers
+ sentencepiece
+ peft

vlm_captions.py DELETED
@@ -1,129 +0,0 @@
- import torch
- from PIL import Image
- from transformers import AutoModel, AutoTokenizer
- from decord import VideoReader, cpu
- import spaces
-
- class VLMCaptioning:
-     def __init__(self):
-         print("Loading MiniCPM-O model...")
-         self.model = AutoModel.from_pretrained(
-             'openbmb/MiniCPM-o-2_6',
-             trust_remote_code=True,
-             attn_implementation='sdpa',
-             torch_dtype=torch.bfloat16,
-             init_vision=True,
-         )
-         self.model = self.model.eval().cuda()
-         self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
-
-     @spaces.GPU()
-     def describe_image(
-         self,
-         image: str,
-         question: str = "Describe this image in detail.",
-         temperature: float = 0.7,
-         top_p: float = 0.9,
-         top_k: int = 40,
-         max_new_tokens: int = 512,
-         stream=False,
-         sampling=False
-     ) -> str:
-         """
-         Generate description for a single image
-
-         Args:
-             image (str): Path to image file
-             question (str): Question to ask about the image
-             temperature (float): Sampling temperature
-             top_p (float): Nucleus sampling parameter
-             top_k (int): Top-k sampling parameter
-             max_new_tokens (int): Maximum new tokens to generate
-
-         Returns:
-             str: Generated description
-         """
-         try:
-             if not image:
-                 return "Please provide an image."
-
-             # Convert image to RGB
-             image = Image.open(image).convert('RGB')
-
-             # Prepare message
-             msgs = [{'role': 'user', 'content': [image, question]}]
-
-             # Generate response
-             response = self.model.chat(
-                 image=None,
-                 msgs=msgs,
-                 tokenizer=self.tokenizer,
-                 temperature=temperature,
-                 top_p=top_p,
-                 top_k=top_k,
-                 max_new_tokens=max_new_tokens,
-                 stream=stream,
-                 sampling=sampling
-             )
-             return response
-         except Exception as e:
-             return f"Error analyzing image: {str(e)}"
-
-     @spaces.GPU()
-     def describe_video(
-         self,
-         video_path: str,
-         frame_interval: int = 30,
-         temperature: float = 0.7,
-         top_p: float = 0.9,
-         top_k: int = 40,
-         max_new_tokens: int = 512,
-         stream=False,
-         sampling=False
-     ) -> str:
-         """
-         Generate description for video frames
-
-         Args:
-             video_path (str): Path to video file
-             frame_interval (int): Interval between frames to analyze
-             temperature (float): Sampling temperature
-             top_p (float): Nucleus sampling parameter
-             top_k (int): Top-k sampling parameter
-             max_new_tokens (int): Maximum new tokens to generate
-
-         Returns:
-             str: Generated description
-         """
-         try:
-             # Load video and extract frames
-             vr = VideoReader(video_path, ctx=cpu(0))
-             total_frames = len(vr)
-             frame_indices = list(range(0, total_frames, frame_interval))
-             frames = vr.get_batch(frame_indices).asnumpy()
-
-             # Convert frames to PIL Images
-             frame_images = [Image.fromarray(frame) for frame in frames]
-
-             # Prepare messages for all frames
-             msgs = [
-                 {'role': 'user', 'content': [frame, "Describe the main action in this scene."]}
-                 for frame in frame_images
-             ]
-
-             # Generate response for all frames at once
-             response = self.model.chat(
-                 image=None,
-                 msgs=msgs,
-                 tokenizer=self.tokenizer,
-                 temperature=temperature,
-                 top_p=top_p,
-                 top_k=top_k,
-                 max_new_tokens=max_new_tokens,
-                 stream=stream,
-                 sampling=sampling
-             )
-             return response
-
-         except Exception as e:
-             return f"Error processing video: {str(e)}"
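
The deleted VLMCaptioning class was the backend of the old Visual Analysis tab. A hypothetical stand-alone use, with placeholder file paths and assuming a CUDA device is available (the constructor calls .cuda() on the MiniCPM-o model), would be:

    from vlm_captions import VLMCaptioning

    captioner = VLMCaptioning()  # downloads and loads openbmb/MiniCPM-o-2_6
    image_caption = captioner.describe_image("example.jpg", question="Describe this image in detail.")
    video_caption = captioner.describe_video("example.mp4", frame_interval=30)  # caption every 30th frame
    print(image_caption)
    print(video_caption)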