youssef committed on
Commit
c0d1640
·
1 Parent(s): abf26d0
Files changed (1) hide show
  1. src/video_processor/processor.py +40 -38
src/video_processor/processor.py CHANGED
@@ -55,34 +55,40 @@ class VideoAnalyzer:
55
  self.model_path,
56
  torch_dtype=torch.bfloat16,
57
  device_map=DEVICE,
58
- _attn_implementation="flash_attention_2"
 
59
  ).to(DEVICE)
60
- logger.info(f"Model loaded on device: {self.model.device}")
 
 
 
61
 
62
  def analyze_segment(self, video_path: str, start_time: float) -> str:
63
  """Analyze a single video segment."""
64
  messages = [
65
  {
66
  "role": "system",
67
- "content": [{"type": "text", "text": """You are a detailed video analysis assistant with expertise in scene description. Your task is to:
68
- 1. Describe the visual content with precise details
69
- 2. Note any significant actions or movements
70
- 3. Describe important objects, people, or elements in the scene
71
- 4. Capture the mood, atmosphere, or emotional content if present
72
- 5. Mention any scene transitions or camera movements
73
- Be specific and thorough, but focus only on what is visually present in this segment."""}]
 
74
  },
75
  {
76
  "role": "user",
77
  "content": [
78
  {"type": "video", "path": video_path},
79
- {"type": "text", "text": """Describe this video segment in detail. Focus on:
80
- - What objects, people, or elements are visible?
81
- - What actions or movements are occurring?
82
- - What is the setting or environment?
83
- - Are there any notable visual effects or transitions?
84
- - What is the overall mood or atmosphere?
85
- Be specific about visual details but stay concise."""}
 
86
  ]
87
  }
88
  ]
@@ -95,12 +101,13 @@ Be specific about visual details but stay concise."""}
95
  return_tensors="pt"
96
  ).to(DEVICE, dtype=torch.bfloat16)
97
 
98
- outputs = self.model.generate(
99
- **inputs,
100
- do_sample=True,
101
- temperature=0.7,
102
- max_new_tokens=256
103
- )
 
104
  return self.processor.batch_decode(outputs, skip_special_tokens=True)[0].split("Assistant: ")[-1]
105
 
106
  def process_video(self, video_path: str, segment_length: int = 10) -> List[Dict]:
@@ -126,24 +133,19 @@ Be specific about visual details but stay concise."""}
126
  # Create segment - Optimized ffmpeg settings
127
  segment_path = os.path.join(temp_dir, f"segment_{start_time}.mp4")
128
  cmd = [
129
- "ffmpeg",
130
- "-y", # Overwrite output files
131
- "-threads", "4", # Use 4 threads
132
- "-i", video_path,
133
- "-ss", str(start_time), # Seek position
134
- "-t", str(end_time - start_time), # Duration
135
- "-c:v", "libx264", # Video codec
136
- "-preset", "ultrafast",
137
- "-tune", "fastdecode",
138
- "-crf", "28", # Lower quality but faster
139
- "-vf", "scale=640:-2", # Resize to smaller resolution
140
- "-an", # Remove audio
141
- "-pix_fmt", "yuv420p",
142
- segment_path
143
- ]
144
 
145
  ffmpeg_start = time.time()
146
- subprocess.run(cmd, check=True, capture_output=True)
147
  ffmpeg_time = time.time() - ffmpeg_start
148
 
149
  # Analyze segment
 
55
  self.model_path,
56
  torch_dtype=torch.bfloat16,
57
  device_map=DEVICE,
58
+ _attn_implementation="flash_attention_2",
59
+ low_cpu_mem_usage=True,
60
  ).to(DEVICE)
61
+
62
+ # Compile model for faster inference
63
+ self.model = torch.compile(self.model, mode="reduce-overhead")
64
+ logger.info(f"Model loaded and compiled on device: {self.model.device}")
65
 
66
  def analyze_segment(self, video_path: str, start_time: float) -> str:
67
  """Analyze a single video segment."""
68
  messages = [
69
  {
70
  "role": "system",
71
+ "content": [{"type": "text", "text": """You are a detailed video analysis assistant. Analyze and describe:
72
+ 1. People: their appearance, actions, and interactions
73
+ 2. Environment: location, weather, time of day, lighting
74
+ 3. Objects: key items, their positions and movements
75
+ 4. Text: any visible text, signs, or captions
76
+ 5. Events: what is happening in sequence
77
+ 6. Visual details: colors, patterns, visual effects
78
+ Be specific about timing and details to enable searching through the video later."""}]
79
  },
80
  {
81
  "role": "user",
82
  "content": [
83
  {"type": "video", "path": video_path},
84
+ {"type": "text", "text": """Describe this segment comprehensively. Include:
85
+ - Who appears and what are they doing?
86
+ - What is the environment and weather like?
87
+ - What objects or items are visible?
88
+ - Is there any text visible on screen?
89
+ - What actions or events are occurring?
90
+ - Note any significant visual details
91
+ Be specific about all visual elements to enable searching later."""}
92
  ]
93
  }
94
  ]
 
101
  return_tensors="pt"
102
  ).to(DEVICE, dtype=torch.bfloat16)
103
 
104
+ with torch.inference_mode():
105
+ outputs = self.model.generate(
106
+ **inputs,
107
+ do_sample=False,
108
+ temperature=0.7,
109
+ max_new_tokens=256,
110
+ )
111
  return self.processor.batch_decode(outputs, skip_special_tokens=True)[0].split("Assistant: ")[-1]
112
 
113
  def process_video(self, video_path: str, segment_length: int = 10) -> List[Dict]:
 
133
  # Create segment - Optimized ffmpeg settings
134
  segment_path = os.path.join(temp_dir, f"segment_{start_time}.mp4")
135
  cmd = [
136
+ "ffmpeg",
137
+ "-y",
138
+ "-i", video_path,
139
+ "-ss", str(start_time),
140
+ "-t", str(segment_length),
141
+ "-c:v", "libx264",
142
+ "-preset", "ultrafast", # Use ultrafast preset for speed
143
+ "-pix_fmt", "yuv420p", # Ensure compatible pixel format
144
+ segment_path
145
+ ]
 
 
 
 
 
146
 
147
  ffmpeg_start = time.time()
148
+ subprocess.run(cmd, check=True)
149
  ffmpeg_time = time.time() - ffmpeg_start
150
 
151
  # Analyze segment