gokaygokay committed on
Commit
b1c4e30
·
1 Parent(s): 91840f8

understanding

Browse files
Files changed (2) hide show
  1. llm_inference_video.py +2 -2
  2. vlm_captions.py +81 -25
llm_inference_video.py CHANGED
@@ -20,11 +20,11 @@ class VideoLLMInferenceNode:
20
 
21
  def analyze_image(self, image_path, question=None):
22
  """Analyze image using MiniCPM-O"""
23
- return self.vlm.analyze_image(image_path, question)
24
 
25
  def analyze_video(self, video_path):
26
  """Analyze video using MiniCPM-O"""
27
- return self.vlm.analyze_video_frames(video_path)
28
 
29
  def generate_video_prompt(
30
  self,
 
20
 
21
  def analyze_image(self, image_path, question=None):
22
  """Analyze image using MiniCPM-O"""
23
+ return self.vlm.describe_image(image_path, question)
24
 
25
  def analyze_video(self, video_path):
26
  """Analyze video using MiniCPM-O"""
27
+ return self.vlm.describe_video(video_path)
28
 
29
  def generate_video_prompt(
30
  self,
vlm_captions.py CHANGED
@@ -17,48 +17,104 @@ class VLMCaptioning:
17
  self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
18
 
19
  @spaces.GPU()
20
- def analyze_image(self, image_path, question="Describe this image in detail."):
21
- """Generate description for a single image"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  try:
23
- image = Image.open(image_path).convert('RGB')
 
 
 
 
 
 
24
  msgs = [{'role': 'user', 'content': [image, question]}]
25
 
 
26
  response = self.model.chat(
27
  image=None,
28
  msgs=msgs,
29
- tokenizer=self.tokenizer
 
 
 
 
30
  )
31
  return response
32
  except Exception as e:
33
  return f"Error analyzing image: {str(e)}"
34
-
35
  @spaces.GPU()
36
- def analyze_video_frames(self, video_path, frame_interval=30):
37
- """Extract and analyze frames from video"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  try:
39
- # Load video
40
  vr = VideoReader(video_path, ctx=cpu(0))
41
  total_frames = len(vr)
42
-
43
- # Extract frames at intervals
44
  frame_indices = list(range(0, total_frames, frame_interval))
45
  frames = vr.get_batch(frame_indices).asnumpy()
 
 
 
46
 
47
- descriptions = []
48
- for frame in frames:
49
- # Convert frame to PIL Image
50
- frame_pil = Image.fromarray(frame)
51
-
52
- # Generate description
53
- msgs = [{'role': 'user', 'content': [frame_pil, "Describe the main action in this scene."]}]
54
- description = self.model.chat(
55
- image=None,
56
- msgs=msgs,
57
- tokenizer=self.tokenizer
58
- )
59
- descriptions.append(description)
60
 
61
- return descriptions
 
 
 
 
 
 
 
 
 
 
62
 
63
  except Exception as e:
64
- return [f"Error processing video: {str(e)}"]
 
17
  self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
18
 
19
  @spaces.GPU()
20
+ def describe_image(
21
+ self,
22
+ image: str,
23
+ question: str = "Describe this image in detail.",
24
+ temperature: float = 0.7,
25
+ top_p: float = 0.9,
26
+ top_k: int = 40,
27
+ max_new_tokens: int = 512,
28
+ ) -> str:
29
+ """
30
+ Generate description for a single image
31
+
32
+ Args:
33
+ image (str): Path to image file
34
+ question (str): Question to ask about the image
35
+ temperature (float): Sampling temperature
36
+ top_p (float): Nucleus sampling parameter
37
+ top_k (int): Top-k sampling parameter
38
+ max_new_tokens (int): Maximum new tokens to generate
39
+
40
+ Returns:
41
+ str: Generated description
42
+ """
43
  try:
44
+ if not image:
45
+ return "Please provide an image."
46
+
47
+ # Convert image to RGB
48
+ image = Image.open(image).convert('RGB')
49
+
50
+ # Prepare message
51
  msgs = [{'role': 'user', 'content': [image, question]}]
52
 
53
+ # Generate response
54
  response = self.model.chat(
55
  image=None,
56
  msgs=msgs,
57
+ tokenizer=self.tokenizer,
58
+ temperature=temperature,
59
+ top_p=top_p,
60
+ top_k=top_k,
61
+ max_new_tokens=max_new_tokens
62
  )
63
  return response
64
  except Exception as e:
65
  return f"Error analyzing image: {str(e)}"
66
+
67
  @spaces.GPU()
68
+ def describe_video(
69
+ self,
70
+ video_path: str,
71
+ frame_interval: int = 30,
72
+ temperature: float = 0.7,
73
+ top_p: float = 0.9,
74
+ top_k: int = 40,
75
+ max_new_tokens: int = 512,
76
+ ) -> str:
77
+ """
78
+ Generate description for video frames
79
+
80
+ Args:
81
+ video_path (str): Path to video file
82
+ frame_interval (int): Interval between frames to analyze
83
+ temperature (float): Sampling temperature
84
+ top_p (float): Nucleus sampling parameter
85
+ top_k (int): Top-k sampling parameter
86
+ max_new_tokens (int): Maximum new tokens to generate
87
+
88
+ Returns:
89
+ str: Generated description
90
+ """
91
  try:
92
+ # Load video and extract frames
93
  vr = VideoReader(video_path, ctx=cpu(0))
94
  total_frames = len(vr)
 
 
95
  frame_indices = list(range(0, total_frames, frame_interval))
96
  frames = vr.get_batch(frame_indices).asnumpy()
97
+
98
+ # Convert frames to PIL Images
99
+ frame_images = [Image.fromarray(frame) for frame in frames]
100
 
101
+ # Prepare messages for all frames
102
+ msgs = [
103
+ {'role': 'user', 'content': [frame, "Describe the main action in this scene."]}
104
+ for frame in frame_images
105
+ ]
 
 
 
 
 
 
 
 
106
 
107
+ # Generate response for all frames at once
108
+ response = self.model.chat(
109
+ image=None,
110
+ msgs=msgs,
111
+ tokenizer=self.tokenizer,
112
+ temperature=temperature,
113
+ top_p=top_p,
114
+ top_k=top_k,
115
+ max_new_tokens=max_new_tokens
116
+ )
117
+ return response
118
 
119
  except Exception as e:
120
+ return f"Error processing video: {str(e)}"