Spaces: Running on Zero
Commit · b1c4e30
1 Parent(s): 91840f8
understanding
Browse files
- llm_inference_video.py +2 -2
- vlm_captions.py +81 -25
llm_inference_video.py
CHANGED
@@ -20,11 +20,11 @@ class VideoLLMInferenceNode:
|
|
20 |
|
21 |
def analyze_image(self, image_path, question=None):
|
22 |
"""Analyze image using MiniCPM-O"""
|
23 |
-
return self.vlm.
|
24 |
|
25 |
def analyze_video(self, video_path):
|
26 |
"""Analyze video using MiniCPM-O"""
|
27 |
-
return self.vlm.
|
28 |
|
29 |
def generate_video_prompt(
|
30 |
self,
|
|
|
20 |
|
21 |
def analyze_image(self, image_path, question=None):
|
22 |
"""Analyze image using MiniCPM-O"""
|
23 |
+
return self.vlm.describe_image(image_path, question)
|
24 |
|
25 |
def analyze_video(self, video_path):
|
26 |
"""Analyze video using MiniCPM-O"""
|
27 |
+
return self.vlm.describe_video(video_path)
|
28 |
|
29 |
def generate_video_prompt(
|
30 |
self,
|
vlm_captions.py
CHANGED
@@ -17,48 +17,104 @@ class VLMCaptioning:
|
|
17 |
self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
|
18 |
|
19 |
@spaces.GPU()
|
20 |
-
def
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
try:
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
msgs = [{'role': 'user', 'content': [image, question]}]
|
25 |
|
|
|
26 |
response = self.model.chat(
|
27 |
image=None,
|
28 |
msgs=msgs,
|
29 |
-
tokenizer=self.tokenizer
|
|
|
|
|
|
|
|
|
30 |
)
|
31 |
return response
|
32 |
except Exception as e:
|
33 |
return f"Error analyzing image: {str(e)}"
|
34 |
-
|
35 |
@spaces.GPU()
|
36 |
-
def
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
try:
|
39 |
-
# Load video
|
40 |
vr = VideoReader(video_path, ctx=cpu(0))
|
41 |
total_frames = len(vr)
|
42 |
-
|
43 |
-
# Extract frames at intervals
|
44 |
frame_indices = list(range(0, total_frames, frame_interval))
|
45 |
frames = vr.get_batch(frame_indices).asnumpy()
|
|
|
|
|
|
|
46 |
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
# Generate description
|
53 |
-
msgs = [{'role': 'user', 'content': [frame_pil, "Describe the main action in this scene."]}]
|
54 |
-
description = self.model.chat(
|
55 |
-
image=None,
|
56 |
-
msgs=msgs,
|
57 |
-
tokenizer=self.tokenizer
|
58 |
-
)
|
59 |
-
descriptions.append(description)
|
60 |
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
except Exception as e:
|
64 |
-
return
|
|
|
17 |
self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
|
18 |
|
19 |
@spaces.GPU()
|
20 |
+
def describe_image(
|
21 |
+
self,
|
22 |
+
image: str,
|
23 |
+
question: str = "Describe this image in detail.",
|
24 |
+
temperature: float = 0.7,
|
25 |
+
top_p: float = 0.9,
|
26 |
+
top_k: int = 40,
|
27 |
+
max_new_tokens: int = 512,
|
28 |
+
) -> str:
|
29 |
+
"""
|
30 |
+
Generate description for a single image
|
31 |
+
|
32 |
+
Args:
|
33 |
+
image (str): Path to image file
|
34 |
+
question (str): Question to ask about the image
|
35 |
+
temperature (float): Sampling temperature
|
36 |
+
top_p (float): Nucleus sampling parameter
|
37 |
+
top_k (int): Top-k sampling parameter
|
38 |
+
max_new_tokens (int): Maximum new tokens to generate
|
39 |
+
|
40 |
+
Returns:
|
41 |
+
str: Generated description
|
42 |
+
"""
|
43 |
try:
|
44 |
+
if not image:
|
45 |
+
return "Please provide an image."
|
46 |
+
|
47 |
+
# Convert image to RGB
|
48 |
+
image = Image.open(image).convert('RGB')
|
49 |
+
|
50 |
+
# Prepare message
|
51 |
msgs = [{'role': 'user', 'content': [image, question]}]
|
52 |
|
53 |
+
# Generate response
|
54 |
response = self.model.chat(
|
55 |
image=None,
|
56 |
msgs=msgs,
|
57 |
+
tokenizer=self.tokenizer,
|
58 |
+
temperature=temperature,
|
59 |
+
top_p=top_p,
|
60 |
+
top_k=top_k,
|
61 |
+
max_new_tokens=max_new_tokens
|
62 |
)
|
63 |
return response
|
64 |
except Exception as e:
|
65 |
return f"Error analyzing image: {str(e)}"
|
66 |
+
|
67 |
@spaces.GPU()
|
68 |
+
def describe_video(
|
69 |
+
self,
|
70 |
+
video_path: str,
|
71 |
+
frame_interval: int = 30,
|
72 |
+
temperature: float = 0.7,
|
73 |
+
top_p: float = 0.9,
|
74 |
+
top_k: int = 40,
|
75 |
+
max_new_tokens: int = 512,
|
76 |
+
) -> str:
|
77 |
+
"""
|
78 |
+
Generate description for video frames
|
79 |
+
|
80 |
+
Args:
|
81 |
+
video_path (str): Path to video file
|
82 |
+
frame_interval (int): Interval between frames to analyze
|
83 |
+
temperature (float): Sampling temperature
|
84 |
+
top_p (float): Nucleus sampling parameter
|
85 |
+
top_k (int): Top-k sampling parameter
|
86 |
+
max_new_tokens (int): Maximum new tokens to generate
|
87 |
+
|
88 |
+
Returns:
|
89 |
+
str: Generated description
|
90 |
+
"""
|
91 |
try:
|
92 |
+
# Load video and extract frames
|
93 |
vr = VideoReader(video_path, ctx=cpu(0))
|
94 |
total_frames = len(vr)
|
|
|
|
|
95 |
frame_indices = list(range(0, total_frames, frame_interval))
|
96 |
frames = vr.get_batch(frame_indices).asnumpy()
|
97 |
+
|
98 |
+
# Convert frames to PIL Images
|
99 |
+
frame_images = [Image.fromarray(frame) for frame in frames]
|
100 |
|
101 |
+
# Prepare messages for all frames
|
102 |
+
msgs = [
|
103 |
+
{'role': 'user', 'content': [frame, "Describe the main action in this scene."]}
|
104 |
+
for frame in frame_images
|
105 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
|
107 |
+
# Generate response for all frames at once
|
108 |
+
response = self.model.chat(
|
109 |
+
image=None,
|
110 |
+
msgs=msgs,
|
111 |
+
tokenizer=self.tokenizer,
|
112 |
+
temperature=temperature,
|
113 |
+
top_p=top_p,
|
114 |
+
top_k=top_k,
|
115 |
+
max_new_tokens=max_new_tokens
|
116 |
+
)
|
117 |
+
return response
|
118 |
|
119 |
except Exception as e:
|
120 |
+
return f"Error processing video: {str(e)}"
|