youssef committed on
Commit
c0d1640
·
1 Parent(s): abf26d0
Files changed (1) hide show
  1. src/video_processor/processor.py +40 -38
src/video_processor/processor.py CHANGED
@@ -55,34 +55,40 @@ class VideoAnalyzer:
55
  self.model_path,
56
  torch_dtype=torch.bfloat16,
57
  device_map=DEVICE,
58
- _attn_implementation="flash_attention_2"
 
59
  ).to(DEVICE)
60
- logger.info(f"Model loaded on device: {self.model.device}")
 
 
 
61
 
62
  def analyze_segment(self, video_path: str, start_time: float) -> str:
63
  """Analyze a single video segment."""
64
  messages = [
65
  {
66
  "role": "system",
67
- "content": [{"type": "text", "text": """You are a detailed video analysis assistant with expertise in scene description. Your task is to:
68
- 1. Describe the visual content with precise details
69
- 2. Note any significant actions or movements
70
- 3. Describe important objects, people, or elements in the scene
71
- 4. Capture the mood, atmosphere, or emotional content if present
72
- 5. Mention any scene transitions or camera movements
73
- Be specific and thorough, but focus only on what is visually present in this segment."""}]
 
74
  },
75
  {
76
  "role": "user",
77
  "content": [
78
  {"type": "video", "path": video_path},
79
- {"type": "text", "text": """Describe this video segment in detail. Focus on:
80
- - What objects, people, or elements are visible?
81
- - What actions or movements are occurring?
82
- - What is the setting or environment?
83
- - Are there any notable visual effects or transitions?
84
- - What is the overall mood or atmosphere?
85
- Be specific about visual details but stay concise."""}
 
86
  ]
87
  }
88
  ]
@@ -95,12 +101,13 @@ Be specific about visual details but stay concise."""}
95
  return_tensors="pt"
96
  ).to(DEVICE, dtype=torch.bfloat16)
97
 
98
- outputs = self.model.generate(
99
- **inputs,
100
- do_sample=True,
101
- temperature=0.7,
102
- max_new_tokens=256
103
- )
 
104
  return self.processor.batch_decode(outputs, skip_special_tokens=True)[0].split("Assistant: ")[-1]
105
 
106
  def process_video(self, video_path: str, segment_length: int = 10) -> List[Dict]:
@@ -126,24 +133,19 @@ Be specific about visual details but stay concise."""}
126
  # Create segment - Optimized ffmpeg settings
127
  segment_path = os.path.join(temp_dir, f"segment_{start_time}.mp4")
128
  cmd = [
129
- "ffmpeg",
130
- "-y", # Overwrite output files
131
- "-threads", "4", # Use 4 threads
132
- "-i", video_path,
133
- "-ss", str(start_time), # Seek position
134
- "-t", str(end_time - start_time), # Duration
135
- "-c:v", "libx264", # Video codec
136
- "-preset", "ultrafast",
137
- "-tune", "fastdecode",
138
- "-crf", "28", # Lower quality but faster
139
- "-vf", "scale=640:-2", # Resize to smaller resolution
140
- "-an", # Remove audio
141
- "-pix_fmt", "yuv420p",
142
- segment_path
143
- ]
144
 
145
  ffmpeg_start = time.time()
146
- subprocess.run(cmd, check=True, capture_output=True)
147
  ffmpeg_time = time.time() - ffmpeg_start
148
 
149
  # Analyze segment
 
55
  self.model_path,
56
  torch_dtype=torch.bfloat16,
57
  device_map=DEVICE,
58
+ _attn_implementation="flash_attention_2",
59
+ low_cpu_mem_usage=True,
60
  ).to(DEVICE)
61
+
62
+ # Compile model for faster inference
63
+ self.model = torch.compile(self.model, mode="reduce-overhead")
64
+ logger.info(f"Model loaded and compiled on device: {self.model.device}")
65
 
66
  def analyze_segment(self, video_path: str, start_time: float) -> str:
67
  """Analyze a single video segment."""
68
  messages = [
69
  {
70
  "role": "system",
71
+ "content": [{"type": "text", "text": """You are a detailed video analysis assistant. Analyze and describe:
72
+ 1. People: their appearance, actions, and interactions
73
+ 2. Environment: location, weather, time of day, lighting
74
+ 3. Objects: key items, their positions and movements
75
+ 4. Text: any visible text, signs, or captions
76
+ 5. Events: what is happening in sequence
77
+ 6. Visual details: colors, patterns, visual effects
78
+ Be specific about timing and details to enable searching through the video later."""}]
79
  },
80
  {
81
  "role": "user",
82
  "content": [
83
  {"type": "video", "path": video_path},
84
+ {"type": "text", "text": """Describe this segment comprehensively. Include:
85
+ - Who appears and what are they doing?
86
+ - What is the environment and weather like?
87
+ - What objects or items are visible?
88
+ - Is there any text visible on screen?
89
+ - What actions or events are occurring?
90
+ - Note any significant visual details
91
+ Be specific about all visual elements to enable searching later."""}
92
  ]
93
  }
94
  ]
 
101
  return_tensors="pt"
102
  ).to(DEVICE, dtype=torch.bfloat16)
103
 
104
+ with torch.inference_mode():
105
+ outputs = self.model.generate(
106
+ **inputs,
107
+ do_sample=False,
108
+ temperature=0.7,
109
+ max_new_tokens=256,
110
+ )
111
  return self.processor.batch_decode(outputs, skip_special_tokens=True)[0].split("Assistant: ")[-1]
112
 
113
  def process_video(self, video_path: str, segment_length: int = 10) -> List[Dict]:
 
133
  # Create segment - Optimized ffmpeg settings
134
  segment_path = os.path.join(temp_dir, f"segment_{start_time}.mp4")
135
  cmd = [
136
+ "ffmpeg",
137
+ "-y",
138
+ "-i", video_path,
139
+ "-ss", str(start_time),
140
+ "-t", str(segment_length),
141
+ "-c:v", "libx264",
142
+ "-preset", "ultrafast", # Use ultrafast preset for speed
143
+ "-pix_fmt", "yuv420p", # Ensure compatible pixel format
144
+ segment_path
145
+ ]
 
 
 
 
 
146
 
147
  ffmpeg_start = time.time()
148
+ subprocess.run(cmd, check=True)
149
  ffmpeg_time = time.time() - ffmpeg_start
150
 
151
  # Analyze segment