gokaygokay committed
Commit 91840f8 · Parent: 13fa156

understanding

Files changed (3):
  1. app.py +137 -109
  2. llm_inference_video.py +12 -1
  3. vlm_captions.py +64 -0
app.py CHANGED
@@ -17,119 +17,147 @@ def create_video_interface():
     with gr.Blocks(theme='bethecloud/storj_theme') as demo:
         gr.HTML(title)

-        with gr.Row():
-            with gr.Column(scale=1):
-                input_concept = gr.Textbox(label="Core Concept/Thematic Input", lines=3)
-                style = gr.Dropdown(
-                    choices=["Minimalist", "Simple", "Detailed", "Descriptive", "Dynamic",
-                             "Cinematic", "Documentary", "Animation", "Action", "Experimental"],
-                    value="Simple",
-                    label="Video Style"
-                )
-                custom_elements = gr.Textbox(label="Custom Technical Elements",
-                                             placeholder="e.g., Infrared hybrid, Datamosh transitions")
-                prompt_length = gr.Dropdown(
-                    choices=["Short", "Medium", "Long"],
-                    value="Medium",
-                    label="Prompt Length"
-                )
-
-            with gr.Column(scale=1):
-                camera_direction = gr.Dropdown(
-                    choices=[
-                        "None",
-                        "Zoom in", "Zoom out", "Pan left", "Pan right",
-                        "Tilt up", "Tilt down", "Orbital rotation",
-                        "Push in", "Pull out", "Track forward", "Track backward",
-                        "Spiral in", "Spiral out", "Arc movement",
-                        "Diagonal traverse", "Vertical rise", "Vertical descent"
-                    ],
-                    value="None",
-                    label="Camera Direction"
-                )
-
-                camera_style = gr.Dropdown(
-                    choices=[
-                        "None",
-                        "Steadicam flow", "Drone aerials", "Handheld urgency", "Crane elegance",
-                        "Dolly precision", "VR 360", "Multi-angle rig", "Static tripod",
-                        "Gimbal smoothness", "Slider motion", "Jib sweep", "POV immersion",
-                        "Time-slice array", "Macro extreme", "Tilt-shift miniature",
-                        "Snorricam character", "Whip pan dynamics", "Dutch angle tension",
-                        "Underwater housing", "Periscope lens"
-                    ],
-                    value="None",
-                    label="Camera Movement Style"
-                )
-
-                pacing = gr.Dropdown(
-                    choices=[
-                        "None",
-                        "Slow burn", "Rhythmic pulse", "Frantic energy", "Ebb and flow",
-                        "Hypnotic drift", "Time-lapse rush", "Stop-motion staccato",
-                        "Gradual build", "Quick cut rhythm", "Long take meditation",
-                        "Jump cut energy", "Match cut flow", "Cross-dissolve dreamscape",
-                        "Parallel action", "Slow motion impact", "Ramping dynamics",
-                        "Montage tempo", "Continuous flow", "Episodic breaks"
-                    ],
-                    value="None",
-                    label="Pacing Rhythm"
-                )
-                special_effects = gr.Dropdown(
-                    choices=[
-                        "None",
-                        "Practical effects", "CGI enhancement", "Analog glitches",
-                        "Light painting", "Projection mapping", "Nanosecond exposures",
-                        "Double exposure", "Smoke diffusion", "Lens flare artistry",
-                        "Particle systems", "Holographic overlay", "Chromatic aberration",
-                        "Digital distortion", "Wire removal", "Motion capture",
-                        "Miniature integration", "Weather simulation", "Color grading",
-                        "Mixed media composite", "Neural style transfer"
-                    ],
-                    value="None",
-                    label="SFX Approach"
-                )
-
-            with gr.Column(scale=1):
-                provider = gr.Dropdown(
-                    choices=["SambaNova", "Groq"],
-                    value="SambaNova",
-                    label="LLM Provider"
-                )
-                model = gr.Dropdown(
-                    choices=[
+        with gr.Tab("Video Prompt Generator"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    input_concept = gr.Textbox(label="Core Concept/Thematic Input", lines=3)
+                    style = gr.Dropdown(
+                        choices=["Minimalist", "Simple", "Detailed", "Descriptive", "Dynamic",
+                                 "Cinematic", "Documentary", "Animation", "Action", "Experimental"],
+                        value="Simple",
+                        label="Video Style"
+                    )
+                    custom_elements = gr.Textbox(label="Custom Technical Elements",
+                                                 placeholder="e.g., Infrared hybrid, Datamosh transitions")
+                    prompt_length = gr.Dropdown(
+                        choices=["Short", "Medium", "Long"],
+                        value="Medium",
+                        label="Prompt Length"
+                    )
+
+                with gr.Column(scale=1):
+                    camera_direction = gr.Dropdown(
+                        choices=[
+                            "None",
+                            "Zoom in", "Zoom out", "Pan left", "Pan right",
+                            "Tilt up", "Tilt down", "Orbital rotation",
+                            "Push in", "Pull out", "Track forward", "Track backward",
+                            "Spiral in", "Spiral out", "Arc movement",
+                            "Diagonal traverse", "Vertical rise", "Vertical descent"
+                        ],
+                        value="None",
+                        label="Camera Direction"
+                    )
+
+                    camera_style = gr.Dropdown(
+                        choices=[
+                            "None",
+                            "Steadicam flow", "Drone aerials", "Handheld urgency", "Crane elegance",
+                            "Dolly precision", "VR 360", "Multi-angle rig", "Static tripod",
+                            "Gimbal smoothness", "Slider motion", "Jib sweep", "POV immersion",
+                            "Time-slice array", "Macro extreme", "Tilt-shift miniature",
+                            "Snorricam character", "Whip pan dynamics", "Dutch angle tension",
+                            "Underwater housing", "Periscope lens"
+                        ],
+                        value="None",
+                        label="Camera Movement Style"
+                    )
+
+                    pacing = gr.Dropdown(
+                        choices=[
+                            "None",
+                            "Slow burn", "Rhythmic pulse", "Frantic energy", "Ebb and flow",
+                            "Hypnotic drift", "Time-lapse rush", "Stop-motion staccato",
+                            "Gradual build", "Quick cut rhythm", "Long take meditation",
+                            "Jump cut energy", "Match cut flow", "Cross-dissolve dreamscape",
+                            "Parallel action", "Slow motion impact", "Ramping dynamics",
+                            "Montage tempo", "Continuous flow", "Episodic breaks"
+                        ],
+                        value="None",
+                        label="Pacing Rhythm"
+                    )
+                    special_effects = gr.Dropdown(
+                        choices=[
+                            "None",
+                            "Practical effects", "CGI enhancement", "Analog glitches",
+                            "Light painting", "Projection mapping", "Nanosecond exposures",
+                            "Double exposure", "Smoke diffusion", "Lens flare artistry",
+                            "Particle systems", "Holographic overlay", "Chromatic aberration",
+                            "Digital distortion", "Wire removal", "Motion capture",
+                            "Miniature integration", "Weather simulation", "Color grading",
+                            "Mixed media composite", "Neural style transfer"
+                        ],
+                        value="None",
+                        label="SFX Approach"
+                    )
+
+                with gr.Column(scale=1):
+                    provider = gr.Dropdown(
+                        choices=["SambaNova", "Groq"],
+                        value="SambaNova",
+                        label="LLM Provider"
+                    )
+                    model = gr.Dropdown(
+                        choices=[
+                            "Meta-Llama-3.1-70B-Instruct",
+                            "Meta-Llama-3.1-405B-Instruct",
+                            "Meta-Llama-3.1-8B-Instruct"
+                        ],
+                        value="Meta-Llama-3.1-70B-Instruct",
+                        label="Model"
+                    )
+
+
+            generate_btn = gr.Button("Generate Video Prompt", variant="primary")
+            output = gr.Textbox(label="Generated Prompt", lines=12, show_copy_button=True)
+
+            def update_models(provider):
+                models = {
+                    "Groq": ["llama-3.3-70b-versatile"],
+                    "SambaNova": [
                         "Meta-Llama-3.1-70B-Instruct",
                         "Meta-Llama-3.1-405B-Instruct",
                         "Meta-Llama-3.1-8B-Instruct"
-                    ],
-                    value="Meta-Llama-3.1-70B-Instruct",
-                    label="Model"
-                )
-
-
-        generate_btn = gr.Button("Generate Video Prompt", variant="primary")
-        output = gr.Textbox(label="Generated Prompt", lines=12, show_copy_button=True)
-
-        def update_models(provider):
-            models = {
-                "Groq": ["llama-3.3-70b-versatile"],
-                "SambaNova": [
-                    "Meta-Llama-3.1-70B-Instruct",
-                    "Meta-Llama-3.1-405B-Instruct",
-                    "Meta-Llama-3.1-8B-Instruct"
-                ]
-            }
-            return gr.Dropdown(choices=models[provider], value=models[provider][0])
-
-        provider.change(update_models, inputs=provider, outputs=model)
-
-        generate_btn.click(
-            llm_node.generate_video_prompt,
-            inputs=[input_concept, style, camera_style, camera_direction, pacing, special_effects,
-                    custom_elements, provider, model, prompt_length],
-            outputs=output
-        )
+                    ]
+                }
+                return gr.Dropdown(choices=models[provider], value=models[provider][0])
+
+            provider.change(update_models, inputs=provider, outputs=model)
+
+            generate_btn.click(
+                llm_node.generate_video_prompt,
+                inputs=[input_concept, style, camera_style, camera_direction, pacing, special_effects,
+                        custom_elements, provider, model, prompt_length],
+                outputs=output
+            )
+
+        with gr.Tab("Visual Analysis"):
+            with gr.Row():
+                with gr.Column():
+                    image_input = gr.Image(label="Upload Image")
+                    image_question = gr.Textbox(
+                        label="Question (optional)",
+                        placeholder="What is in this image?"
+                    )
+                    analyze_image_btn = gr.Button("Analyze Image")
+                    image_output = gr.Textbox(label="Analysis Result", lines=5)
+
+                with gr.Column():
+                    video_input = gr.Video(label="Upload Video")
+                    analyze_video_btn = gr.Button("Analyze Video")
+                    video_output = gr.Textbox(label="Video Analysis", lines=10)
+
+            analyze_image_btn.click(
+                llm_node.analyze_image,
+                inputs=[image_input, image_question],
+                outputs=image_output
+            )

+            analyze_video_btn.click(
+                llm_node.analyze_video,
+                inputs=video_input,
+                outputs=video_output
+            )

     return demo

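Note on the provider/model pairing above: it uses Gradio's dependent-dropdown pattern, where the `provider.change` handler returns a new `gr.Dropdown` whose choices replace those of `model`. A minimal self-contained sketch of just that pattern, with a hypothetical `providers` mapping (not this app's actual provider list):

import gradio as gr

# Hypothetical provider -> models mapping, for illustration only.
providers = {"A": ["a-small", "a-large"], "B": ["b-base"]}

with gr.Blocks() as demo:
    provider = gr.Dropdown(choices=list(providers), value="A", label="Provider")
    model = gr.Dropdown(choices=providers["A"], value="a-small", label="Model")

    def update_models(provider_name):
        models = providers[provider_name]
        # Returning a Dropdown instance swaps the choices and resets the selection.
        return gr.Dropdown(choices=models, value=models[0])

    provider.change(update_models, inputs=provider, outputs=model)

demo.launch()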
llm_inference_video.py CHANGED
@@ -2,7 +2,7 @@ import os
 import random
 from groq import Groq
 from openai import OpenAI
-from gradio_client import Client
+from vlm_captions import VLMCaptioning

 class VideoLLMInferenceNode:
     def __init__(self):
@@ -14,6 +14,17 @@ class VideoLLMInferenceNode:
             api_key=self.sambanova_api_key,
             base_url="https://api.sambanova.ai/v1",
         )
+
+        # Initialize VLM captioning
+        self.vlm = VLMCaptioning()
+
+    def analyze_image(self, image_path, question=None):
+        """Analyze image using MiniCPM-O; empty/None questions fall back to the default prompt"""
+        return self.vlm.analyze_image(image_path, question or "Describe this image in detail.")
+
+    def analyze_video(self, video_path):
+        """Analyze video using MiniCPM-O; joins per-frame captions into one string for the Textbox"""
+        return "\n\n".join(self.vlm.analyze_video_frames(video_path))

     def generate_video_prompt(
         self,
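These wrappers keep `app.py` talking to a single object: `llm_node` now serves both the text-prompt tab and the new visual-analysis tab. A hedged usage sketch outside Gradio, assuming a CUDA machine (VLMCaptioning loads MiniCPM-o onto the GPU at construction) and local files `sample.jpg` / `clip.mp4` that exist only for this example:

from llm_inference_video import VideoLLMInferenceNode

node = VideoLLMInferenceNode()  # also loads the MiniCPM-o captioner

# Image Q&A; omitting the question uses the generic description prompt.
print(node.analyze_image("sample.jpg", "What objects are visible?"))

# Video analysis: one caption per sampled frame, joined into a single string.
print(node.analyze_video("clip.mp4"))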
vlm_captions.py ADDED
@@ -0,0 +1,64 @@
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoTokenizer
+from decord import VideoReader, cpu
+import spaces
+
+class VLMCaptioning:
+    def __init__(self):
+        print("Loading MiniCPM-O model...")
+        self.model = AutoModel.from_pretrained(
+            'openbmb/MiniCPM-o-2_6',
+            trust_remote_code=True,
+            attn_implementation='sdpa',
+            torch_dtype=torch.bfloat16
+        )
+        self.model = self.model.eval().cuda()
+        self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
+
+    @spaces.GPU()
+    def analyze_image(self, image_path, question="Describe this image in detail."):
+        """Generate description for a single image"""
+        try:
+            image = Image.open(image_path).convert('RGB')
+            msgs = [{'role': 'user', 'content': [image, question]}]
+
+            response = self.model.chat(
+                image=None,
+                msgs=msgs,
+                tokenizer=self.tokenizer
+            )
+            return response
+        except Exception as e:
+            return f"Error analyzing image: {str(e)}"
+
+    @spaces.GPU()
+    def analyze_video_frames(self, video_path, frame_interval=30):
+        """Extract and analyze frames from video"""
+        try:
+            # Load video
+            vr = VideoReader(video_path, ctx=cpu(0))
+            total_frames = len(vr)
+
+            # Extract frames at intervals
+            frame_indices = list(range(0, total_frames, frame_interval))
+            frames = vr.get_batch(frame_indices).asnumpy()
+
+            descriptions = []
+            for frame in frames:
+                # Convert frame to PIL Image
+                frame_pil = Image.fromarray(frame)
+
+                # Generate description
+                msgs = [{'role': 'user', 'content': [frame_pil, "Describe the main action in this scene."]}]
+                description = self.model.chat(
+                    image=None,
+                    msgs=msgs,
+                    tokenizer=self.tokenizer
+                )
+                descriptions.append(description)
+
+            return descriptions
+
+        except Exception as e:
+            return [f"Error processing video: {str(e)}"]