youssef
committed on
Commit
·
c0d1640
1
Parent(s):
abf26d0
optimize
Browse files
Files changed: src/video_processor/processor.py (+40 -38)
src/video_processor/processor.py
CHANGED
@@ -55,34 +55,40 @@ class VideoAnalyzer:
|
|
55 |
self.model_path,
|
56 |
torch_dtype=torch.bfloat16,
|
57 |
device_map=DEVICE,
|
58 |
-
_attn_implementation="flash_attention_2"
|
|
|
59 |
).to(DEVICE)
|
60 |
-
|
|
|
|
|
|
|
61 |
|
62 |
def analyze_segment(self, video_path: str, start_time: float) -> str:
|
63 |
"""Analyze a single video segment."""
|
64 |
messages = [
|
65 |
{
|
66 |
"role": "system",
|
67 |
-
"content": [{"type": "text", "text": """You are a detailed video analysis assistant
|
68 |
-
1.
|
69 |
-
2.
|
70 |
-
3.
|
71 |
-
4.
|
72 |
-
5.
|
73 |
-
|
|
|
74 |
},
|
75 |
{
|
76 |
"role": "user",
|
77 |
"content": [
|
78 |
{"type": "video", "path": video_path},
|
79 |
-
{"type": "text", "text": """Describe this
|
80 |
-
-
|
81 |
-
- What
|
82 |
-
- What
|
83 |
-
-
|
84 |
-
- What
|
85 |
-
|
|
|
86 |
]
|
87 |
}
|
88 |
]
|
@@ -95,12 +101,13 @@ Be specific about visual details but stay concise."""}
|
|
95 |
return_tensors="pt"
|
96 |
).to(DEVICE, dtype=torch.bfloat16)
|
97 |
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
|
|
104 |
return self.processor.batch_decode(outputs, skip_special_tokens=True)[0].split("Assistant: ")[-1]
|
105 |
|
106 |
def process_video(self, video_path: str, segment_length: int = 10) -> List[Dict]:
|
@@ -126,24 +133,19 @@ Be specific about visual details but stay concise."""}
|
|
126 |
# Create segment - Optimized ffmpeg settings
|
127 |
segment_path = os.path.join(temp_dir, f"segment_{start_time}.mp4")
|
128 |
cmd = [
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
"-vf", "scale=640:-2", # Resize to smaller resolution
|
140 |
-
"-an", # Remove audio
|
141 |
-
"-pix_fmt", "yuv420p",
|
142 |
-
segment_path
|
143 |
-
]
|
144 |
|
145 |
ffmpeg_start = time.time()
|
146 |
-
subprocess.run(cmd, check=True
|
147 |
ffmpeg_time = time.time() - ffmpeg_start
|
148 |
|
149 |
# Analyze segment
|
|
|
55 |
self.model_path,
|
56 |
torch_dtype=torch.bfloat16,
|
57 |
device_map=DEVICE,
|
58 |
+
_attn_implementation="flash_attention_2",
|
59 |
+
low_cpu_mem_usage=True,
|
60 |
).to(DEVICE)
|
61 |
+
|
62 |
+
# Compile model for faster inference
|
63 |
+
self.model = torch.compile(self.model, mode="reduce-overhead")
|
64 |
+
logger.info(f"Model loaded and compiled on device: {self.model.device}")
|
65 |
|
66 |
def analyze_segment(self, video_path: str, start_time: float) -> str:
|
67 |
"""Analyze a single video segment."""
|
68 |
messages = [
|
69 |
{
|
70 |
"role": "system",
|
71 |
+
"content": [{"type": "text", "text": """You are a detailed video analysis assistant. Analyze and describe:
|
72 |
+
1. People: their appearance, actions, and interactions
|
73 |
+
2. Environment: location, weather, time of day, lighting
|
74 |
+
3. Objects: key items, their positions and movements
|
75 |
+
4. Text: any visible text, signs, or captions
|
76 |
+
5. Events: what is happening in sequence
|
77 |
+
6. Visual details: colors, patterns, visual effects
|
78 |
+
Be specific about timing and details to enable searching through the video later."""}]
|
79 |
},
|
80 |
{
|
81 |
"role": "user",
|
82 |
"content": [
|
83 |
{"type": "video", "path": video_path},
|
84 |
+
{"type": "text", "text": """Describe this segment comprehensively. Include:
|
85 |
+
- Who appears and what are they doing?
|
86 |
+
- What is the environment and weather like?
|
87 |
+
- What objects or items are visible?
|
88 |
+
- Is there any text visible on screen?
|
89 |
+
- What actions or events are occurring?
|
90 |
+
- Note any significant visual details
|
91 |
+
Be specific about all visual elements to enable searching later."""}
|
92 |
]
|
93 |
}
|
94 |
]
|
|
|
101 |
return_tensors="pt"
|
102 |
).to(DEVICE, dtype=torch.bfloat16)
|
103 |
|
104 |
+
with torch.inference_mode():
|
105 |
+
outputs = self.model.generate(
|
106 |
+
**inputs,
|
107 |
+
do_sample=False,
|
108 |
+
temperature=0.7,
|
109 |
+
max_new_tokens=256,
|
110 |
+
)
|
111 |
return self.processor.batch_decode(outputs, skip_special_tokens=True)[0].split("Assistant: ")[-1]
|
112 |
|
113 |
def process_video(self, video_path: str, segment_length: int = 10) -> List[Dict]:
|
|
|
133 |
# Create segment - Optimized ffmpeg settings
|
134 |
segment_path = os.path.join(temp_dir, f"segment_{start_time}.mp4")
|
135 |
cmd = [
|
136 |
+
"ffmpeg",
|
137 |
+
"-y",
|
138 |
+
"-i", video_path,
|
139 |
+
"-ss", str(start_time),
|
140 |
+
"-t", str(segment_length),
|
141 |
+
"-c:v", "libx264",
|
142 |
+
"-preset", "ultrafast", # Use ultrafast preset for speed
|
143 |
+
"-pix_fmt", "yuv420p", # Ensure compatible pixel format
|
144 |
+
segment_path
|
145 |
+
]
|
|
|
|
|
|
|
|
|
|
|
146 |
|
147 |
ffmpeg_start = time.time()
|
148 |
+
subprocess.run(cmd, check=True)
|
149 |
ffmpeg_time = time.time() - ffmpeg_start
|
150 |
|
151 |
# Analyze segment
|